# Source code for tamil.txt2ipa.ipaconvert

#!/usr/bin/python
# -*- coding: utf-8 -*-

##############################################################################
# (C) 2014 Arulalan.T <[email protected]>                                  #
#                                                                            #
# Written By : Arulalan.T <[email protected]>                              #
# Date : 02.08.2014                                                          #
#                                                                            #
# This file is part of open-tamil/txt2ipa                                    #
#                                                                            #
# txt2ipa is free software: you can redistribute it and/or                   #   
# modify it under the terms of the GNU General Public License as published by#
# the Free Software Foundation, either version 3 of the License, or (at your #
# option) any later version. This program is distributed in the hope that it #
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty#
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General#
# Public License for more details. You should have received a copy of the GNU#
# General Public License along with this program. If not, see                #   
# <http://www.gnu.org/licenses/>.                                            #   
#                                                                            #
##############################################################################

import re 


# Convert Tamil text into romanized encoding using transliteration.php before
# applying any of the functions below.

def ipa(text):
    """Generate a narrow IPA transcription of romanized Tamil text.

    The input must already be in the romanized encoding this module
    expects (upper/lower case letters are distinct phonemes).  The text is
    rewritten by a long, strictly ordered sequence of substitutions; later
    rules rely on the output of earlier ones, so the order must not change.

    The section numbers in the comments refer to the transcription rules
    described in the book "Panniru Thirumurai Olipeyarppu" by
    Punal K Murugaiyan.

    :param text: romanized Tamil text.
    :returns: the transcription.  Leading whitespace is stripped; the
        padding space added at the end (and the spaces inserted around
        punctuation) are kept.
    """
    # Lookahead that is true when the next character is a retroflex
    # consonant; used to add the rhotic hook to the preceding vowel.
    before_retroflex = "(?=[ɽɖʈɳɭɻ])"

    # Pad both ends so that rules anchored on whitespace also apply to the
    # first and the last word.
    text = " " + text + " "

    # Move punctuation: isolate each mark with a space on either side.
    text = re.sub(r"([,\\.!?\"'()])", r" \1 ", text)

    # Diphthongs
    text = text.replace("ai", "ay")  # 5.3.3.1 ii Palatal Approximant (ai)
    text = text.replace("au", "av")  # au - diphthong replacement

    # Grantha
    text = text.replace("j", "ʤ")
    text = text.replace("h", "ɦ")
    text = text.replace("S", "ʂ")
    text = text.replace("srI", "ʂrI")

    # ---- Mey: Vallinam ----------------------------------------------------
    # pa
    text = re.sub(r"(?<=[aAiIuUeEoO])p(?=[aAiIuUeEoO])", "β", text)  # 5.3.1.6 iii Voiced Bilabial Fricative
    text = re.sub(r"(?<=[yrlvZL])p", "β", text)  # 5.3.1.6 iii
    text = re.sub(r"(?<=[GJnVmN])p", "b", text)  # 5.3.1.6 ii Voiced Bilabial Plosive
    # 5.3.1.6 i no replacement

    # ta
    text = re.sub(r"(?<=[aAiIuUeEoOyrlvZL])t(?=[aAiIuUeEoO])", "ð", text)  # 5.3.1.5 iii Voiced Dental Fricative
    text = re.sub(r"(?<=[nV])t", "d̪", text)  # 5.3.1.5 ii Voiced Dental Plosive
    text = re.sub(r"t", "t̪", text)  # 5.3.1.5 i Voiceless Dental Plosive

    # Ra
    text = text.replace("XX", "t̺t̺ʳ")  # 5.3.1.4 ii
    text = re.sub(r"(?<=V)X", "d̺ʳ", text)  # 5.3.1.4 iii
    # 5.3.1.4 iv & v are implemented in the Idaiyinam section

    # Ta
    text = re.sub(r"(?<=[aAiIuUeEoO])T(?=[aAiIuUeEoO])", "ɽ", text)  # 5.3.1.3 iii Retroflex Flap
    text = re.sub(r"(?<=[N])T", "ɖ", text)  # 5.3.1.3 ii Voiced Retroflex Plosive
    text = text.replace("T", "ʈ")  # 5.3.1.3 i Voiceless Retroflex Plosive

    # ca
    text = re.sub(r"(?<=[aAiIuUeEoOl])c(?=[aAiIuUeEoO])", "s", text)  # 5.3.1.2 iii Voiceless Alveolar Fricative
    text = re.sub(r"(\s)c", r"\1s", text)  # 5.3.1.2 iii
    text = re.sub(r"(V)c", r"\1s", text)
    text = re.sub(r"(?<=[J])c", "ʤ", text)  # 5.3.1.2 ii Voiced Post-Alveolar Affricate
    text = re.sub(r"c", "ʧ", text)  # 5.3.1.2 i Voiceless Post-Alveolar Affricate

    # ka
    text = re.sub(r"Gk(?=[iI])", "ŋʲgʲ", text)  # 5.3.2.1 ii Palatalized Velar Nasal
    text = text.replace("Gk", "ŋg")  # 5.3.2.1 Velar Nasal
    text = re.sub(r"(?<=[aAiIuUeEoO])k(?=[iI])", "ç", text)  # 5.3.1.1 viii Voiceless Palatal Fricative
    # yrZlLv assumed above.  Missing in definition: eykiya -> eyçiya, aarkiya -> aarçiya ...
    text = re.sub(r"(?<=r)k(?=[aAuUeEoO])", "ɣ", text)  # 5.3.1.1 vii Voiced Velar Fricative
    text = re.sub(r"(?<=[aAiIuUeEoO])k(?=[aAuUeEoO])", "x", text)  # 5.3.1.1 vi Voiceless Velar Fricative
    text = re.sub(r"(?<=[ylvZL])k(?=[aAuUeEoO])", "x", text)  # as above
    text = re.sub(r"ykk", "jcc", text)  # 5.3.1.1 v Voiceless Palatal Plosive
    text = re.sub(r"jkk", "jcc", text)  # as above
    text = re.sub(r"(?<=[rylvZLGVNaAiIuUeEoO])k(?=[iI])", "gʲ", text)  # 5.3.1.1 iv Voiced Palatalized Velar Plosive
    text = re.sub(r"(?<=[NVmn])k(?=[aAuUeEoO])", "g", text)  # 5.3.1.1 iii Voiced Velar Plosive
    text = re.sub(r"(?<=k)k(?=[iI])", "kʲ", text)  # 5.3.1.1 ii Voiceless Velar Plosive
    # 5.3.1.1 i no replacement

    # ---- Mey: Idaiyinam ---------------------------------------------------
    text = text.replace("Z", "ɻ")  # 5.3.3.6 Retroflex Approximant
    text = re.sub(r"(?<=[aAiIuUeEoO])L(?=[aAiIuUeEoO])", "ɭʼ", text)  # 5.3.3.5 i Lateral Approximant - Ejective
    text = text.replace("L", "ɭ")  # 5.3.3.5 ii Lateral Approximant
    # 5.3.3.4 no change
    text = re.sub(r"(?<=[aAiIuUeEoO])[rX](?=[aAiIuUeEoO])", "ɾ", text)  # 5.3.3.3 i Alveolar Tap
    # 5.3.3.3 ii - pure consonant r - no replacement
    # NOTE(review): "(?!=...)" is a negative lookahead for a literal "="
    # followed by a vowel, so in practice every remaining X becomes "r".
    # Possibly "(?![...])" was meant; kept as written - confirm intent.
    text = re.sub(r"X(?!=[aAiIuUeEoO])", "r", text)  # 5.3.3.3 ii Trill
    text = re.sub(r"(?<=[aAiIuUeEoO])v(?=[aAiIuUeEoO])", "ʋ", text)  # 5.3.3.2 ii Labio-dental Approximant
    text = re.sub(r"(\s)v(?=[aAiIuUeEoO])", r"\1ʋ", text)  # 5.3.3.2 ii
    text = text.replace("vv", "ʊ̯ʋ")  # 5.3.3.2 i near-close near-back rounded vowel - part of a diphthong
    text = text.replace("v", "ʋ")
    text = re.sub(r"yy", "jɪ̯", text)  # 5.3.3.1 i near-close near-front unrounded vowel - part of a diphthong
    text = re.sub(r"y", "ɪ̯", text)  # 5.3.3.1 i

    # ---- Mey: Mellinam ----------------------------------------------------
    # 5.3.2.6 no replacement
    text = re.sub(r"[Vn]", "n̺", text)  # 5.3.2.4 Alveolar Nasal
    text = text.replace("n̺d̪", "n̪d̪")  # 5.3.2.5 Dental Nasal
    text = re.sub(r"(?<=[aAiIuUeEoO])N(?=[aAiIuUeEoO])", "ɳʼ", text)  # 5.3.2.3 ii Retroflex Nasal Ejective
    text = text.replace("N", "ɳ")  # 5.3.2.3 Retroflex Nasal
    text = text.replace("J", "ɲ")  # 5.3.2.3 Palatal Nasal
    text = re.sub(r"GG(?=[iI])", "ŋʲŋʲ", text)  # assumed based on above
    text = text.replace("GG", "ŋŋ")  # assumed based on above
    text = text.replace("G", "ŋ")  # assumed based on above

    # ---- Uyir -------------------------------------------------------------
    # Separate pure vowel combinations with "_" so the "_X" rules below can
    # treat the second vowel like a word-initial one.
    text = re.sub(r"([aAiIuUeEoO])([aAiIuUeEoO])", r"\1_\2", text)

    # Long o (romanized as lowercase "o")
    text = re.sub(r"o(\s)", r"o·\1", text)  # 5.2.5.2 v
    text = re.sub(r"(\s)o(?!·)", r"\1ʷoː", text)  # 5.2.5.2 iii
    text = re.sub(r"_o(?!·)", "ʷoː", text)  # 5.2.5.2 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·])o(?![ː·])", "oː", text)  # 5.2.5.2 i

    # Short o (romanized as uppercase "O")
    text = re.sub(r"(\s)O(?!·)", r"\1ʷo̞", text)  # 5.2.5.1 iii
    text = re.sub(r"_O(?!·)", "ʷo̞", text)  # 5.2.5.1 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞])O(?![ː·])", "o̞", text)  # 5.2.5.1 i
    # Rhotic hook on a lowered vowel before a retroflex consonant.  The
    # pattern previously began with a stray "/" (a PHP regex delimiter),
    # which prevented it from ever matching.
    text = re.sub("̞" + before_retroflex, "̞˞", text)

    # Long e (romanized as lowercase "e")
    text = re.sub(r"e(\s)", r"e·\1", text)  # 5.2.4.2 v
    text = re.sub(r"(\s)e(?!·)", r"\1ʲeː", text)  # 5.2.4.2 iii
    text = re.sub(r"_e(?!·)", "ʲeː", text)  # 5.2.4.2 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞])e(?![ː·])", "eː", text)  # 5.2.5.2 i

    # Short e (romanized as uppercase "E")
    text = re.sub(r"(\s)E(?!·)", r"\1ʲɛ̝", text)  # 5.2.4.1 iii
    text = re.sub(r"_E(?!·)", "ʲɛ̝", text)  # 5.2.4.1 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞̝])E(?![ː·])", "ɛ̝", text)  # 5.2.5.4 i
    # Rhotic hook on a raised vowel before a retroflex consonant (same
    # stray-"/" fix as for the lowered vowel above).
    text = re.sub("̝" + before_retroflex, "̝˞", text)

    # Short u
    text = re.sub(r"(?<!u)(\S)(?<![bʋpmβ])u", r"\1ɨ", text)  # 5.2.3.1 v
    text = re.sub(r"(\s)u(?!·)", r"\1ʷʊ", text)  # 5.2.3.1 iii
    text = re.sub(r"_u(?!·)", "ʷʊ", text)  # 5.2.3.1 iii - puththi_Ottum
    text = re.sub(r"(?<!u\S)([bʋpmβ])u", r"\1ʉ̩", text)  # 5.2.3.1 vii
    text = re.sub(r"([bʋpβm])u", r"\1ʉ̩", text)  # 5.2.3.1 vii
    text = re.sub(r"(?<![bʋpβm])u(\s)", r"ɨ\1", text)  # 5.2.3.1 v
    text = re.sub(r"(\s)(\S)(ɨ|ʉ̩)", r"\1\2ʊ", text)  # 5.2.5.2 i
    text = re.sub(r"(U)(\S{1,2})ɨ", r"\1\2ʊ", text)  # 5.2.5.2 i
    text = re.sub(r"(ʊ)(\S{1,2})ɨ", r"\1\2ʊ", text)
    text = re.sub(r"(ʊ)(\S{1,2})ʉ̩", r"\1\2ʊ", text)
    text = re.sub(r"(?<![bʋβpm])ʊ(\s)", r"ɨ\1", text)  # 5.2.3.1 v
    text = re.sub(r"(?<=[bʋβpm])ʊ(\s)", r"ʉ̩\1", text)
    text = re.sub("ʊ" + before_retroflex, "ʊ˞", text)
    text = re.sub("ʉ̩" + before_retroflex, "ʉ̩˞", text)
    text = re.sub("ɨ" + before_retroflex, "ɨ˞", text)

    # Long u
    text = re.sub(r"U(\s)", r"u·\1", text)  # 5.2.3.2 v
    text = re.sub(r"(\s)U(?!·)", r"\1ʷuː", text)  # 5.2.3.2 iii
    text = re.sub(r"_U(?!·)", "ʷuː", text)  # 5.2.3.2 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞̝])U(?![ː·])", "uː", text)  # 5.2.3.2 i

    # Short i
    text = re.sub(r"i(\s)", r"ɪ·\1", text)  # 5.2.2.1 iii
    text = re.sub(r"(\s)i(?!·)", r"\1ʲɪ", text)  # 5.2.4.2 iii
    text = re.sub(r"_i(?!·)", "ʲɪ", text)  # 5.2.4.2 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞̝])i(?![ː·])", "ɪ", text)  # 5.2.5.2 i
    text = re.sub("ɪ" + before_retroflex, "ɪ˞", text)

    # Long i
    text = re.sub(r"I(\s)", r"i·\1", text)  # 5.2.2.2 v
    text = re.sub(r"(\s)I(?!·)", r"\1ʲiː", text)  # 5.2.2.2 iii
    text = re.sub(r"_I(?!·)", "ʲiː", text)  # 5.2.2.2 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞̝])I(?![ː·])", "iː", text)  # 5.2.2.2 i

    # Long a
    text = re.sub(r"(\s)A(?!·)", r"\1ˀɑː", text)  # 5.2.1.2 iii
    text = re.sub(r"_A(?!·)", "ˀɑː", text)  # 5.2.1.2 iii - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞̝])A(?![ː·])", "ɑː", text)  # 5.2.1.2 i

    # Short a (transcription of abbreviations is ignored)
    text = re.sub(r"(\s)a(?!·)", r"\1ˀʌ", text)  # 5.2.1.1 vi
    text = re.sub(r"_a(?!·)", "ˀʌ", text)  # 5.2.1.1 vi - puththi_Ottum
    text = re.sub(r"(?<![aAiIuUeEoOː·̞̝])a(?![ː·])", "ʌ", text)  # 5.2.1.1 i
    text = re.sub(r"ʌ(\s)", r"ə\1", text)  # 5.2.1.1 iii
    text = re.sub("ʌ" + before_retroflex, "ʌ˞", text)

    # Aytham
    text = text.replace("K", "x")
    text = text.replace("xt̪", "xð")
    text = text.replace("xk", "xx")

    # Rhotic hook for long vowels: it is placed before the length mark.
    text = re.sub("ː" + before_retroflex, "˞ː", text)

    # Fixing kaaZBbu / kaLaip / kaLaippu bugs
    text = text.replace("βp", "pp")
    text = text.replace("β ", "p ")
    # "ac samayam" -> "ac camayam" (also across a newline)
    text = re.sub(r"ʧ([ \n])s", r"ʧ\1ʧ", text)

    # New IPA conventions
    # NOTE(review): both arguments look identical here, which would make
    # this call a no-op; kept unchanged - confirm the intended characters.
    text = text.replace("ː", "ː")
    text = text.replace("·", "ˑ")
    text = text.replace("ʤ", "dʒ")

    # Only the leading padding is removed; trailing whitespace is kept.
    return text.lstrip()
# end of def ipa(text):

# Converts a narrow transcription to a broad transcription
def broad(text):
    """Convert a narrow IPA transcription into a broad transcription.

    Diacritics marking vowel retroflexion, ejectives, vowel position,
    syllabicity and glottalization are removed, and narrow consonant and
    vowel symbols are replaced by broader ones.  The palatalization and
    labialization markers are left in place.

    :param text: narrow transcription.
    :returns: the broad transcription.
    """
    # Applied strictly in this order: some results feed later rules, e.g.
    # "ç" becomes "x", which in turn becomes "g".
    replacements = (
        ("˞", ""),    # remove retroflexion of vowels
        ("ʼ", ""),    # remove ejectives
        ("̝", ""),    # remove vowel position: raised
        ("̞", ""),    # remove vowel position: lowered
        ("̩", ""),    # remove syllabic mark
        ("ˀ", ""),    # remove glottalization
        ("ɨ", "ʉ"),
        # Narrow transcriptions of consonants
        ("ɣ", "g"),   # voiced velar fricative -> voiced velar plosive
        ("β", "b"),   # voiced bilabial fricative -> voiced bilabial plosive
        ("ç", "x"),   # voiceless palatal fricative -> voiceless velar fricative
        ("ʊ̯", "ʋ"),   # non-syllabic near-close near-back rounded vowel -> approximant
        ("ð", "d̪"),   # voiced dental fricative -> voiced dental plosive
        ("ɽ", "ɖ"),   # retroflex flap -> voiced retroflex plosive
        ("c", "k"),   # voiceless palatal plosive -> voiceless velar plosive
        ("n̺", "n"),
        ("x", "g"),
        ("ŋʲ", "ŋ"),
        ("gʲ", "g"),
        ("kʲ", "k"),
        ("ɪ̯", "j"),
        ("ʌ", "ə"),
    )
    for narrow, plain in replacements:
        text = text.replace(narrow, plain)
    return text
# end of def broad(text):