# Source code for tamil.txt2unicode.encode2unicode

#!/usr/bin/env python
# -*- coding: utf-8 -*-

##############################################################################
# (C) 2014 Arulalan.T <[email protected]>                                  #
#                                                                            #
# Author : Arulalan.T <[email protected]>                                  #
# Date : 04.08.2014                                                          #
#                                                                            #
# This file is part of open-tamil/txt2uni                                    #
#                                                                            #
# txt2uni is free software: you can redistribute it and/or                   #
# modify it under the terms of the GNU General Public License as published by#
# the Free Software Foundation, either version 3 of the License, or (at your #
# option) any later version. This program is distributed in the hope that it #
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty#
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General#
# Public License for more details. You should have received a copy of the GNU#
# General Public License along with this program. If not, see                #
# <http://www.gnu.org/licenses/>.                                            #
#                                                                            #
##############################################################################
# Detect the interpreter's major version numerically.  Comparing the
# ``sys.version`` string lexicographically would misreport a two-digit major
# version (for example '10.0' sorts before '3').
from sys import version_info
PYTHON3 = version_info[0] >= 3
del version_info

try:
    # python 2
    from .orddic import OrderedDict
except ImportError as ime:
    # python 3
    from collections import OrderedDict

from .encode2utf8 import anjal2utf8, bamini2utf8, boomi2utf8, \
    dinakaran2utf8, dinamani2utf8, dinathanthy2utf8, \
    kavipriya2utf8, murasoli2utf8, mylai2utf8, nakkeeran2utf8, \
    roman2utf8, tab2utf8, tam2utf8, tscii2utf8, pallavar2utf8, \
    indoweb2utf8, koeln2utf8, libi2utf8, oldvikatan2utf8, webulagam2utf8, \
    diacritic2utf8, shreelipi2utf8, softview2utf8, tace2utf8, vanavil2utf8, \
    indica2utf8, anu2utf8, shreelipiavid2utf8

# Public names exported by this module: one converter per supported legacy
# encoding, plus the auto-detecting ``auto2unicode``.
__all__ = [
    'anjal2unicode',
    'bamini2unicode',
    'boomi2unicode',
    'dinakaran2unicode',
    'dinathanthy2unicode',
    'kavipriya2unicode',
    'murasoli2unicode',
    'mylai2unicode',
    'nakkeeran2unicode',
    'roman2unicode',
    'tab2unicode',
    'tam2unicode',
    'tscii2unicode',
    'indoweb2unicode',
    'koeln2unicode',
    'libi2unicode',
    'oldvikatan2unicode',
    'webulagam2unicode',
    'auto2unicode',
    'dinamani2unicode',
    'pallavar2unicode',
    'diacritic2unicode',
    'shreelipi2unicode',
    'softview2unicode',
    'tace2unicode',
    'vanavil2unicode',
    'indica2unicode',
    'anu2unicode',
    'shreelipiavid2unicode',
]
    
# Character maps taken into account by the encoding auto-detection, keyed by
# charmap name.  An OrderedDict keeps the candidates in this fixed order.
# NOTE(review): anu2utf8 and shreelipiavid2utf8 are imported by this module
# but are not registered here -- confirm whether that is intentional.
_all_encodes_ = OrderedDict([
    ('anjal2utf8', anjal2utf8),
    ('bamini2utf8', bamini2utf8),
    ('boomi2utf8', boomi2utf8),
    ('dinakaran2utf8', dinakaran2utf8),
    ('dinamani2utf8', dinamani2utf8),
    ('dinathanthy2utf8', dinathanthy2utf8),
    ('kavipriya2utf8', kavipriya2utf8),
    ('murasoli2utf8', murasoli2utf8),
    ('mylai2utf8', mylai2utf8),
    ('nakkeeran2utf8', nakkeeran2utf8),
    ('roman2utf8', roman2utf8),
    ('tab2utf8', tab2utf8),
    ('tam2utf8', tam2utf8),
    ('tscii2utf8', tscii2utf8),
    ('pallavar2utf8', pallavar2utf8),
    ('indoweb2utf8', indoweb2utf8),
    ('koeln2utf8', koeln2utf8),
    ('libi2utf8', libi2utf8),
    ('oldvikatan2utf8', oldvikatan2utf8),
    ('webulagam2utf8', webulagam2utf8),
    ('diacritic2utf8', diacritic2utf8),
    ('shreelipi2utf8', shreelipi2utf8),
    ('softview2utf8', softview2utf8),
    ('tace2utf8', tace2utf8),
    ('vanavil2utf8', vanavil2utf8),
    ('indica2utf8', indica2utf8),
])


# Debug switch.  When this flag is enabled, _get_unique_common_encodes()
# writes the common characters of all encodes to
# 'all.encodes.common.chars.txt' and each encode's unique characters to
# '<encode name>.unique.chars.txt'.
__WRITE_CHARS_TXT = False


def encode2unicode(text, charmap):
    '''
    Convert encoded text to unicode by substring replacement.

    text : a string, or a list / tuple of strings.  The items of a list or
           tuple are converted one by one and concatenated without any
           separator.
    charmap : dictionary which has both encode as key, unicode as value.
              Entries are applied in the dictionary's iteration order, each
              one on the output of the previous replacement.

    Returns the converted string.
    Raises TypeError when text is neither a string nor a list / tuple.
    '''
    def _convert(chunk):
        # Apply every mapping in turn; later entries see earlier results.
        for key, val in charmap.items():
            if key in chunk:
                chunk = chunk.replace(key, val)
        return chunk

    if isinstance(text, (list, tuple)):
        return ''.join(_convert(line) for line in text)
    if isinstance(text, str):
        return _convert(text)
    # TypeError is a subclass of Exception, so callers that catch the
    # generic Exception raised previously keep working.
    raise TypeError("Unexpected input kind. Must be string or list or tuple")

# One thin converter per supported encoding; each simply applies the matching
# character map through encode2unicode().

def anjal2unicode(text):
    """Convert `text` to unicode via the ``anjal2utf8`` charmap."""
    return encode2unicode(text, anjal2utf8)


def bamini2unicode(text):
    """Convert `text` to unicode via the ``bamini2utf8`` charmap."""
    return encode2unicode(text, bamini2utf8)


def boomi2unicode(text):
    """Convert `text` to unicode via the ``boomi2utf8`` charmap."""
    return encode2unicode(text, boomi2utf8)


def dinakaran2unicode(text):
    """Convert `text` to unicode via the ``dinakaran2utf8`` charmap."""
    return encode2unicode(text, dinakaran2utf8)


def dinamani2unicode(text):
    """Convert `text` to unicode via the ``dinamani2utf8`` charmap."""
    return encode2unicode(text, dinamani2utf8)


def dinathanthy2unicode(text):
    """Convert `text` to unicode via the ``dinathanthy2utf8`` charmap."""
    return encode2unicode(text, dinathanthy2utf8)


def kavipriya2unicode(text):
    """Convert `text` to unicode via the ``kavipriya2utf8`` charmap."""
    return encode2unicode(text, kavipriya2utf8)


def murasoli2unicode(text):
    """Convert `text` to unicode via the ``murasoli2utf8`` charmap."""
    return encode2unicode(text, murasoli2utf8)


def mylai2unicode(text):
    """Convert `text` to unicode via the ``mylai2utf8`` charmap."""
    return encode2unicode(text, mylai2utf8)


def nakkeeran2unicode(text):
    """Convert `text` to unicode via the ``nakkeeran2utf8`` charmap."""
    return encode2unicode(text, nakkeeran2utf8)


def roman2unicode(text):
    """Convert `text` to unicode via the ``roman2utf8`` charmap."""
    return encode2unicode(text, roman2utf8)


def tab2unicode(text):
    """Convert `text` to unicode via the ``tab2utf8`` charmap."""
    return encode2unicode(text, tab2utf8)


def tam2unicode(text):
    """Convert `text` to unicode via the ``tam2utf8`` charmap."""
    return encode2unicode(text, tam2utf8)


def tscii2unicode(text):
    """Convert `text` to unicode via the ``tscii2utf8`` charmap."""
    return encode2unicode(text, tscii2utf8)


def pallavar2unicode(text):
    """Convert `text` to unicode via the ``pallavar2utf8`` charmap."""
    return encode2unicode(text, pallavar2utf8)


def indoweb2unicode(text):
    """Convert `text` to unicode via the ``indoweb2utf8`` charmap."""
    return encode2unicode(text, indoweb2utf8)


def koeln2unicode(text):
    """Convert `text` to unicode via the ``koeln2utf8`` charmap."""
    return encode2unicode(text, koeln2utf8)


def libi2unicode(text):
    """Convert `text` to unicode via the ``libi2utf8`` charmap."""
    return encode2unicode(text, libi2utf8)


def oldvikatan2unicode(text):
    """Convert `text` to unicode via the ``oldvikatan2utf8`` charmap."""
    return encode2unicode(text, oldvikatan2utf8)


def webulagam2unicode(text):
    """Convert `text` to unicode via the ``webulagam2utf8`` charmap."""
    return encode2unicode(text, webulagam2utf8)


def diacritic2unicode(text):
    """Convert `text` to unicode via the ``diacritic2utf8`` charmap."""
    return encode2unicode(text, diacritic2utf8)


def shreelipi2unicode(text):
    """Convert `text` to unicode via the ``shreelipi2utf8`` charmap."""
    return encode2unicode(text, shreelipi2utf8)


def softview2unicode(text):
    """Convert `text` to unicode via the ``softview2utf8`` charmap."""
    return encode2unicode(text, softview2utf8)


def tace2unicode(text):
    """Convert `text` to unicode via the ``tace2utf8`` charmap."""
    return encode2unicode(text, tace2utf8)


def vanavil2unicode(text):
    """Convert `text` to unicode via the ``vanavil2utf8`` charmap."""
    return encode2unicode(text, vanavil2utf8)


def indica2unicode(text):
    """Convert `text` to unicode via the ``indica2utf8`` charmap."""
    return encode2unicode(text, indica2utf8)


def anu2unicode(text):
    """Convert `text` to unicode via the ``anu2utf8`` charmap."""
    return encode2unicode(text, anu2utf8)


def shreelipiavid2unicode(text):
    """Convert `text` to unicode via the ``shreelipiavid2utf8`` charmap."""
    return encode2unicode(text, shreelipiavid2utf8)


def _get_unique_ch(text, all_common_encodes):
    """
    Return the first word of `text` that still has characters left once
    the common encode characters have been stripped from it.

    text : encode sample strings; a single string is split on newlines,
           a list / tuple is used as a sequence of lines.
    all_common_encodes : iterable of character sequences shared between
           encodes; every occurrence is removed from each word.

    returns unique word / characters from input text encode strings, or
    '' when no word survives the filtering.
    """
    if isinstance(text, str):
        text = text.split("\n")

    special_chars = ['.', ',', ';', ':', '', ' ', '\r', '\t', '=', '\n']
    for line in text:
        for word in line.split(' '):
            if not PYTHON3:
                word = word.decode('utf-8')
            for ch in all_common_encodes:
                if ch in word:
                    word = word.replace(ch, '')

            # nothing left of this word, go for another word
            if not word:
                continue

            # Walk the word as it was before special characters are
            # dropped; `remaining` collects the stripped result.
            remaining = word
            for ch in word:
                if ch.isdigit() or ch in special_chars:
                    # remove special common chars
                    remaining = remaining.replace(ch, '')
                    continue
                # got unique chars from user passed text
                return remaining
    return ''


def _write_encode_chars_txt(unique_encodes, common_encodes):
    """
    Debug helper: write the common and the per-encode unique characters,
    together with their unicode values, to text files.

    unique_encodes : list of (encode name, set of unique characters).
    common_encodes : set of characters common to several encodes.

    Writes 'all.encodes.common.chars.txt' and one
    '<encode name>.unique.chars.txt' per encode.
    """
    def table_key(ch):
        # On Python 2 the characters were decoded from UTF-8 before being
        # collected, so they are encoded back for the table lookup; on
        # Python 3 they are used unchanged.
        return ch if PYTHON3 else ch.encode('utf-8')

    # Python 2's built-in open() has no `encoding` parameter.
    open_kwargs = {'encoding': 'utf-8'} if PYTHON3 else {}

    # write common compound characters of all encodes
    with open('all.encodes.common.chars.txt', 'w', **open_kwargs) as f:
        for ch in common_encodes:
            key = table_key(ch)
            for table in _all_encodes_.values():
                if key in table:
                    f.write(key + ' => ' + table[key] + '\n')
                    break

    # write unique compound characters of all encodes
    for encode_name, encode_keys in unique_encodes:
        table = _all_encodes_[encode_name]
        with open(encode_name + '.unique.chars.txt', 'w',
                  **open_kwargs) as f:
            for ch in encode_keys:
                key = table_key(ch)
                f.write(key + ' => ' + table[key] + '\n')


def _get_unique_common_encodes():
    """
    This function will return both unique_encodes and common_encodes as
    tuple.

    unique_encodes : list of (encode name, set) tuples, where the set holds
                     the characters of that encode which do not occur in
                     any other available encode.

    common_encodes : In set type which has all common encode compound
                     characters from all available encodes.
                     i.e. removed common encode single characters

    Author : Arulalan.T
    04.08.2014
    """
    # key set of every encode table
    all_unicode_encodes = {}
    for name, encode in _all_encodes_.items():
        # A conditional expression is used instead of `and ... or`, which
        # would fall through to .decode() for a falsy key.
        all_unicode_encodes[name] = set(
            ch if PYTHON3 else ch.decode('utf-8') for ch in encode.keys())

    unique_encodes = []
    common_encodes = set()
    for supname, full_keys in all_unicode_encodes.items():
        # get unique of this encode among other encodings; `-` builds a new
        # set each time, so full_keys itself is never modified
        unique_keys = full_keys
        for subname, sub_keys in all_unicode_encodes.items():
            if supname != subname:
                unique_keys = unique_keys - sub_keys
        # whatever is not unique is common with some other encode
        common_encodes |= full_keys - unique_keys
        # store the encode's unique keys with its name
        unique_encodes.append((supname, unique_keys))

    # remove single common char from compound common chars
    common_encodes = set(ch for ch in common_encodes if len(ch) != 1)

    if __WRITE_CHARS_TXT:
        _write_encode_chars_txt(unique_encodes, common_encodes)

    return (unique_encodes, common_encodes)
def auto2unicode(text):
    """
    This function tries to identify encode in available encodings.
    If it finds, then it will convert text into unicode string.

    text : string, or list / tuple of strings, in one of the encodes
           registered in _all_encodes_.

    Returns the converted text, or '' when the encode could not be
    identified; in that case an explanatory message is printed.

    Author : Arulalan.T
    04.08.2014
    """
    unique_encodes, common_encodes = _get_unique_common_encodes()
    # get unique word which falls under any one of available encodes from
    # user passed text lines
    unique_chars = _get_unique_ch(text, common_encodes)

    if unique_chars:
        for encode_name, encode_keys in unique_encodes:
            # check whether any unique char of this encode is present
            if any(ch in unique_chars for ch in encode_keys):
                # found encode
                # NOTE(review): this prints a tuple; kept unchanged so the
                # emitted output stays identical.
                print(("Found encode : ", encode_name))
                return encode2unicode(text, _all_encodes_[encode_name])

    # Either no usable word was found or no encode claimed it.
    msg = "Sorry, couldn't find encode :-(\n"
    msg += ('Need more words to find unique encode out side of %d '
            % len(common_encodes))
    msg += 'common compound characters'
    print(msg)
    return ''