# Source code for solthiruthi.dom
## -*- coding: utf-8 -*-
## (C) 2015 Muthiah Annamalai,
##
from __future__ import print_function
import abc
import codecs
import re
from tamil import utf8
from .datastore import Queue
# DOM for documents
class Position:
    """Hold a ``row``/``col`` pair; ``Entity`` in this module derives from it."""

    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2;
    # Python 3 treats it as an ordinary class attribute.
    __metaclass__ = abc.ABCMeta

    def __init__(self, row, col):
        """Store *row* and *col* unchanged on the instance."""
        self.row, self.col = row, col
class Entity(Position):
    """A positioned piece of text together with its letter breakdown.

    Attributes set by the constructor:
        flagged -- value supplied by the caller (``False`` by default).
        word    -- the text passed in, stored unchanged.
        letters -- whatever ``utf8.get_letters(word)`` returns.
    """

    # NOTE: ``__metaclass__`` is only honoured by Python 2; on Python 3 the
    # ``abstractmethod`` marker on ``isWord`` is not enforced at instantiation.
    __metaclass__ = abc.ABCMeta

    def __init__(self, word, flagged=False, **kwargs):
        """Initialise the entity.

        Every keyword other than ``word`` and ``flagged`` is forwarded to the
        next ``__init__`` in the MRO; ``Position.__init__`` requires ``row``
        and ``col``.
        """
        # Base initialiser runs first so a bad ``row``/``col`` fails early.
        super(Entity, self).__init__(**kwargs)
        self.flagged, self.word = flagged, word
        self.letters = utf8.get_letters(word)

    def isFlagged(self):
        """Return the ``flagged`` attribute."""
        return self.flagged

    @abc.abstractmethod
    def isWord(self):
        """Subclasses report here whether the entity is a word."""

    def getLetters(self):
        """Return the ``letters`` attribute computed in the constructor."""
        return self.letters
class WordEntity(Entity):
    """Concrete ``Entity`` whose ``isWord`` always answers ``True``."""

    def __init__(self, word, **kwargs):
        """Pass *word* and all remaining keywords on to ``Entity.__init__``."""
        super(WordEntity, self).__init__(word=word, **kwargs)

    def isWord(self):
        """Return ``True``: this entity is a word."""
        return True
class NonEntity(Entity, Position):
    """Concrete ``Entity`` whose ``isWord`` always answers ``False``.

    ``Position`` is listed as a base in addition to ``Entity`` (which already
    derives from it); the base list is kept exactly as it was.
    """

    def __init__(self, word, **kwargs):
        """Pass *word* and all remaining keywords on to ``Entity.__init__``."""
        super(NonEntity, self).__init__(word=word, **kwargs)

    def isWord(self):
        """Return ``False``: this entity is not a word."""
        return False
class Document(Queue):
    """Open contents of a file on load.

    The constructor reads the whole file; ``tokenize`` then scans the lines.
    """

    def __init__(self, filename):
        """Read *filename* as UTF-8 and keep its lines in ``self.text``.

        ``self.text`` is the list returned by ``readlines()``, so line
        terminators are kept.  The ``Queue`` initialiser runs after the file
        has been read.
        """
        self.filename = filename
        with codecs.open(filename, 'r', 'utf-8') as fileobj:
            self.text = fileobj.readlines()
        super(Document, self).__init__()

    def tokenize(self):
        """Scan the loaded lines, then verify that the queue is not empty.

        NOTE(review): tokenization is still a stub.  Each line is searched
        for a run of spaces, tabs or carriage returns and the match is
        discarded; nothing is enqueued by this method.  Building entities
        from the matches and adding them to the queue remains TODO.

        Raises:
            Exception: when ``self.isempty()`` is true after the scan
                (``isempty`` is not defined in this class -- presumably
                inherited from ``Queue``; confirm against ``datastore``).
        """
        # Raw string: the earlier non-raw literal contained the invalid
        # escape backslash-space.  The character class matched is unchanged:
        # space, tab and carriage return.
        separator = re.compile(r'[ \t\r]+')
        for line in self.text:
            # This used to be a bare ``re.search()`` with no arguments, which
            # raised TypeError on the first line of any non-empty document.
            separator.search(line)
        if self.isempty():
            raise Exception("Empty File: Cannot be tokenized")