Source code for ngram.Corpus

# -*- coding: utf-8 -*-
# 
# The MIT License (MIT)
# 
# (C) முத்தையா அண்ணாமலை 2013-2015
# 

import codecs
from tamil import utf8


class Corpus:
    """A corpus data file from which only the Tamil letters are read.

    Attributes:
        filename: path of the corpus file, as passed to the constructor.
        handle: the currently open file object, or None when no file is
            open. It is only non-None while the file is being read.
    """

    def __init__(self, filename):
        """Remember the corpus path; the file is not opened here."""
        self.filename = filename
        self.handle = None

    def close(self):
        """Close the open file handle, if any. Safe to call repeatedly."""
        # getattr guards against a partially constructed instance whose
        # __init__ never got as far as assigning self.handle.
        handle = getattr(self, 'handle', None)
        self.handle = None
        if handle is not None:
            handle.close()

    def __del__(self):
        # Finalizers must never raise: this is deliberate best-effort
        # cleanup, so any error while closing is intentionally ignored.
        try:
            self.close()
        except Exception:
            pass

    def next_tamil_letter(self):
        """Generate the Tamil letters of the corpus file, in file order.

        The file is opened as UTF-8 and read fully into memory when the
        first letter is requested, then closed immediately, so no file
        handle stays open while the caller consumes the generator. The text
        is split with utf8.get_letters_iterable and only the letters
        accepted by utf8.istamil are yielded.

        Errors raised by codecs.open or by reading (for example a missing
        file or invalid UTF-8 data) propagate to the caller.
        """
        # Release any handle left over from an earlier call before opening
        # a new one, so repeated calls do not leak file descriptors.
        self.close()
        self.handle = codecs.open(self.filename, 'r', 'utf-8')
        try:
            text = self.handle.read()
        finally:
            self.close()

        for letter in utf8.get_letters_iterable(text):
            if utf8.istamil(letter):
                yield letter