Source code for tamil.tweetparser

# This Python file uses the following encoding: utf-8
# (C) 2013-2016 Muthiah Annamalai
# 
# This file is now part of open-tamil project
# 
# N.B.: Originally written to demo the Python Twitter API for a lightning talk
# at the Boston Python group on 30th July, 2013.

import unicodedata
import re

class TweetParser:
    """Break a single tweet into user handles, hashtags, URLs and RT/MT flags.

    Attributes:
        Owner: the timeline owner passed in; replaced by the first mentioned
            handle when the tweet is a retweet mentioning at least one handle.
        tweet: the raw tweet text.
        UserHandles: ``@handle`` strings in order of occurrence.
        Hashtags: ``#tag`` strings in order of occurrence.
        URLs: URL-like strings in order of occurrence.
        RT: True when the stripped tweet starts with ``RT``.
        MT: True when the stripped tweet starts with ``MT``.
    """

    # Compiled once; shared by every call to the static extractors below.
    _HANDLE_PATTERN = re.compile(r'@[a-zA-Z0-9_]+')
    # Optional http:// or https:// scheme, then "<letters/digits/slashes>.<rest>".
    # The scheme is a non-capturing *group*: a bracketed character class here
    # would only consume one stray character and chop the scheme off the match.
    _URL_PATTERN = re.compile(r'(?:https?://)?[a-zA-Z\d/]+\.+[a-zA-Z\d/.]+')

    def __init__(self, timeline_owner, tweet):
        self.Owner = timeline_owner
        self.tweet = tweet
        self.UserHandles = TweetParser.getUserHandles(tweet)
        self.Hashtags = TweetParser.getHashtags(tweet)
        self.URLs = TweetParser.getURLs(tweet)
        self.RT = TweetParser.getAttributeRT(tweet)
        self.MT = TweetParser.getAttributeMT(tweet)

        # Additional intelligence: a retweet is attributed to the first
        # handle it mentions rather than to the timeline owner.
        if self.RT and self.UserHandles:
            self.Owner = self.UserHandles[0]

    def __str__(self):
        return "owner %s, urls: %d, hashtags %d, user_handles %d, len_tweet %d, RT = %s, MT = %s" % (
            self.Owner,
            len(self.URLs),
            len(self.Hashtags),
            len(self.UserHandles),
            len(self.tweet),
            self.RT,
            self.MT,
        )

    @staticmethod
    def getAttributeRT(tweet):
        """Return True when the tweet, ignoring surrounding whitespace, starts with 'RT'."""
        return tweet.strip().startswith('RT')

    @staticmethod
    def getAttributeMT(tweet):
        """Return True when the tweet, ignoring surrounding whitespace, starts with 'MT'."""
        return tweet.strip().startswith('MT')

    @staticmethod
    def getUserHandles(tweet):
        """Return every ``@handle`` (ASCII letters, digits, underscore) in order of occurrence."""
        return TweetParser._HANDLE_PATTERN.findall(tweet)

    @staticmethod
    def _isHashtagChar(char):
        """Return True when ``char`` may appear in the body of a hashtag.

        Accepts everything the regex class ``\\w`` accepts (alphanumerics and
        underscore) plus Unicode combining marks, so that vowel signs and the
        virama of Indic scripts such as Tamil do not terminate the hashtag.
        """
        return (
            char == '_'
            or char.isalnum()
            or unicodedata.category(char).startswith('M')
        )

    @staticmethod
    def getHashtags(tweet):
        """Return every ``#hashtag`` in order of occurrence.

        A hashtag is a ``#`` followed by one or more hashtag characters (see
        ``_isHashtagChar``). A ``#`` with nothing valid after it is skipped.
        """
        hashtags = []
        pos, end = 0, len(tweet)
        while pos < end:
            if tweet[pos] != '#':
                pos += 1
                continue
            stop = pos + 1
            while stop < end and TweetParser._isHashtagChar(tweet[stop]):
                stop += 1
            if stop > pos + 1:
                hashtags.append(tweet[pos:stop])
            # Resume at the first character that ended the tag; it may itself
            # be a '#' that opens the next hashtag.
            pos = stop
        return hashtags

    @staticmethod
    def getURLs(tweet):
        """Return every URL-like token in order of occurrence.

        A token is an optional ``http://`` or ``https://`` scheme followed by
        letters/digits/slashes, at least one dot, and more letters, digits,
        slashes or dots. Bare domains such as ``example.org`` are matched too.
        """
        return TweetParser._URL_PATTERN.findall(tweet)
class TamilTweetParser(TweetParser):
    """TweetParser that additionally collects the all-Tamil words of a tweet.

    Attributes:
        TAWords: list of whitespace-separated words of the tweet made up
            exclusively of Tamil characters, in order of occurrence.
    """

    def __init__(self, timeline_owner, tweet):
        TweetParser.__init__(self, timeline_owner, tweet)
        self.TAWords = TamilTweetParser.getTamilWords(tweet)

    def __str__(self):
        """Return the parent summary with the Tamil word count tacked on."""
        return TweetParser.__str__(self) + ", TA words = %d " % len(self.TAWords)

    @staticmethod
    def _scriptOf(char):
        """Return the first word of the Unicode name of ``char`` (e.g. 'TAMIL').

        Characters without a Unicode name (control characters such as newline
        or tab) yield an empty string instead of raising ValueError.
        """
        return unicodedata.name(char, u'').partition(u' ')[0]

    @staticmethod
    def isTamilPredicate(word):
        """Return True when every character of ``word`` is a Tamil character.

        An empty word yields True, since no character fails the check.
        """
        return all(TamilTweetParser._scriptOf(c) == u'TAMIL' for c in word)

    @staticmethod
    def cleanupPunct(tweet):
        """Replace every character that is neither Tamil nor Latin with a space.

        Digits, punctuation, symbols and unnamed control characters all become
        a single space each, so the result has the same length as the input.
        """
        keep = (u'TAMIL', u'LATIN')
        return u''.join(
            c if TamilTweetParser._scriptOf(c) in keep else u' '
            for c in tweet
        )

    @staticmethod
    def getTamilWords(tweet):
        """Return the list of words in ``tweet`` written entirely in Tamil.

        The tweet is first stripped of non-Tamil/non-Latin characters, then
        split on whitespace; mixed-script and Latin-only words are dropped.
        A real list is returned (not a lazy ``filter`` object) so callers can
        take its ``len()`` on Python 3.
        """
        cleaned = TamilTweetParser.cleanupPunct(tweet)
        # After cleanup the only whitespace left is u' ', so a plain split()
        # yields the non-empty words directly.
        return [
            word for word in cleaned.split()
            if TamilTweetParser.isTamilPredicate(word)
        ]