Source code for tamil.utils.TextSummaryExtractor

#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import re
import tamil

# This file was originally part of 'TamilNLP' and was repackaged for Open-Tamil.
# [Tamil text summarization program](https://github.com/AshokR/TamilNLP/wiki/Text-Summary-Extractor)
# Copyright © 2016 இரா. அசோகன்
# Licensed under the Apache License, Version 2.0 (the "License");
# This is a naive text summarization algorithm created by Shlomi Babluki
# Copyright (C) 2013 Shlomi Babluki
# 
# http://thetokenizer.com/2013/04/28/build-your-own-summary-tool/
# https://gist.github.com/shlomibabluki/5473521
# 
# Copyright (C) 2016 Muthu Annamalai

class SummaryTool(object):
    # Naive method for splitting a text into sentences
    def split_content_to_sentences(self, content):
        content = content.replace("\n", ". ")
        return content.split(". ")

    # Naive method for splitting a text into paragraphs
    def split_content_to_paragraphs(self, content):
        return content.split("\n\n")

    # Calculate the intersection between 2 sentences
    def sentences_intersection(self, sent1, sent2):
        # split the sentence into words/tokens
        # s1 = set(sent1.split(" "))
        # s2 = set(sent2.split(" "))
        s1 = set(tamil.utf8.get_letters(sent1))
        s2 = set(tamil.utf8.get_letters(sent2))

        # If there is no intersection, just return 0
        # if (len(s1) + len(s2)) == 0:
        if len(s1.intersection(s2)) == 0:
            return 0

        # We normalize the result by the average number of letters
        return len(s1.intersection(s2)) / ((len(s1) + len(s2)) / 2.0)
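
    # Worked example (illustrative numbers, not from the library): if sent1 has
    # 10 distinct letters, sent2 has 6, and they share 4, the score is
    # 4 / ((10 + 6) / 2.0) = 0.5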

    # Format a sentence - remove all non-alphabetic chars from the sentence
    # We'll use the formatted sentence as a key in our sentences dictionary
    def format_sentence(self, sentence):
        # sentence = re.sub(r'\W+', '', sentence) # [\u0B80-\u0BFF]
        sentence = re.sub(r'\s+', '', sentence)
        sentence = re.sub(r'\d+', '', sentence)
        # print sentence
        return sentence
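
    # Illustrative example (the input string is made up):
    # format_sentence(u"இது 2016 ஆம் ஆண்டு.") returns u"இதுஆம்ஆண்டு."
    # - whitespace and digits are stripped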

    # Convert the content into a dictionary <K, V>
    # K = The formatted sentence
    # V = The rank of the sentence
    def get_sentences_ranks(self, content):
        # Split the content into sentences
        sentences = self.split_content_to_sentences(content)

        # Calculate the intersection of every two sentences
        n = len(sentences)
        values = [[0 for x in range(n)] for x in range(n)]
        for i in range(0, n):
            for j in range(0, n):
                # Metric for intersection is symmetric so we calculate only half the matrix
                # For additional metrics see: ngram.Distance module in open-tamil
                # Ref https://github.com/Ezhil-Language-Foundation/open-tamil/blob/master/ngram/Distance.py
                if i >= j:
                    values[i][j] = values[j][i]
                    continue
                values[i][j] = self.sentences_intersection(sentences[i], sentences[j])

        # Build the sentences dictionary
        # The score of a sentence is the sum of all its intersections
        sentences_dic = {}
        for i in range(0, n):
            score = 0
            for j in range(0, n):
                if i == j:
                    continue
                score += values[i][j]
            kw = self.format_sentence(sentences[i])
            if len(kw) != 0:
                sentences_dic[kw] = score
        return sentences_dic
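
    # Illustrative (hypothetical) shape of the returned dictionary:
    # { u"formattedsentence1": 1.7, u"formattedsentence2": 0.9, ... }
    # Higher scores mean a sentence overlaps more with the rest of the text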

    # Return the best sentence in a paragraph
    def get_best_sentence(self, paragraph, sentences_dic):
        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        # Get the best sentence according to the sentences dictionary
        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if strip_s:
                if sentences_dic[strip_s] > max_value:
                    max_value = sentences_dic[strip_s]
                    best_sentence = s
        return best_sentence

    # Build the summary
    def get_summary(self, title, content, sentences_dic):
        # Split the content into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # Add the title
        summary = []
        summary.append(title.strip())
        summary.append("")

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return ("\n").join(summary)
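

# A minimal usage sketch, not part of the original module: it assumes the class
# above is available in the current namespace and simply wires
# get_sentences_ranks() and get_summary() together on a small placeholder
# two-paragraph input.
if __name__ == "__main__":
    st = SummaryTool()
    title = u"ஒரு தலைப்பு"  # placeholder title
    content = (
        u"முதல் பத்தியின் முதல் வாக்கியம். முதல் பத்தியின் இரண்டாம் வாக்கியம்.\n\n"
        u"இரண்டாம் பத்தியின் முதல் வாக்கியம். இரண்டாம் பத்தியின் இரண்டாம் வாக்கியம்."
    )
    # Rank every sentence across the whole text, then pick the best
    # sentence from each paragraph to build the summary
    sentences_dic = st.get_sentences_ranks(content)
    summary = st.get_summary(title, content, sentences_dic)
    print(summary)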