Source code for libindic.stemmer

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email:
# santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in

import os

import marisa_trie
from rreplace import rreplace


class Malayalam:
    """Rule-based suffix-stripping stemmer for Malayalam.

    A word is first looked up in a root-word dictionary (``data/rootwords.txt``
    loaded into a ``marisa_trie.Trie``).  If it is not a known root, suffix
    rewrite rules from ``data/ml.rules`` are applied repeatedly until a root
    word is reached or no rule matches.
    """

    # Trailing characters removed by trim().  Kept as a native string so that
    # it works with both byte strings and text on Python 2.
    _PUNCTUATIONS = '~!@#$%^&*()-+_={}|:;<>,.?'

    # (consonant + virama + ZWJ) sequence -> atomic chillu code point.
    _CHILLU_MAP = (
        (u'\u0d15\u0d4d\u200d', u'\u0d7f'),
        (u'\u0d23\u0d4d\u200d', u'\u0d7a'),
        (u'\u0d28\u0d4d\u200d', u'\u0d7b'),
        (u'\u0d30\u0d4d\u200d', u'\u0d7c'),
        (u'\u0d32\u0d4d\u200d', u'\u0d7d'),
        (u'\u0d33\u0d4d\u200d', u'\u0d7e'),
    )

    def __init__(self):
        """Locate the rules file and load the root-word dictionary.

        The rules themselves are loaded lazily on the first call to
        :meth:`stem`.
        """
        self.rules_file = os.path.join(
            os.path.dirname(__file__), 'data/ml.rules')
        self.rulesDict = None
        dictionary_path = os.path.join(
            os.path.dirname(__file__), 'data/rootwords.txt')
        # Read bytes and decode explicitly so the result does not depend on
        # the platform's default text encoding.
        with open(dictionary_path, 'rb') as dictionary_file:
            root_words = [line.decode('utf-8').strip()
                          for line in dictionary_file]
        # The (now closed) file object stays available as an attribute, as
        # it was before.
        self.dictionary_file = dictionary_file
        self.dictionary = marisa_trie.Trie(root_words)

    @staticmethod
    def _to_unicode(value):
        """Return *value* as text, decoding UTF-8 when it is a byte string.

        A byte string that is not valid UTF-8 is returned unchanged.
        """
        if isinstance(value, bytes):
            try:
                return value.decode('utf-8')
            except UnicodeDecodeError:
                return value
        return value

    def singleencode(self, word):
        """Normalize *word* to a single encoding.

        Every chillu written as consonant + virama + ZERO WIDTH JOINER is
        replaced by the corresponding atomic chillu character.

        :param word: text to normalize
        :returns: the normalized text
        """
        for sequence, chillu in self._CHILLU_MAP:
            word = word.replace(sequence, chillu)
        return word

    def stem(self, text):
        """Stem every word of *text*.

        The text is split on single space characters.  Each word is trimmed
        of surrounding whitespace and punctuation, normalized with
        :meth:`singleencode` and then reduced with the suffix rules.

        :param text: Malayalam string
        :returns: Dictionary with the words of the string (exactly as they
            appear in *text*) as keys and their corresponding stems as
            values.
        """
        if self.rulesDict is None:
            self.rulesDict = self.LoadRules()
        result_dict = {}
        for original_word in text.split(" "):
            cleaned = self.trim(original_word).strip('!,.?:')
            stem = self.singleencode(self._to_unicode(cleaned))
            # Multilevel inflection handling: keep rewriting until a root
            # word is found or no rule matches.  ``seen`` stops the loop if
            # the rules ever lead back to an earlier intermediate form.
            seen = set()
            while stem not in self.dictionary and stem not in seen:
                seen.add(stem)
                rewritten = self._apply_rule(stem)
                if rewritten is None:
                    # No matching rule - probably a root word.
                    break
                stem = rewritten
            result_dict[original_word] = stem
        return result_dict

    def _apply_rule(self, word):
        """Rewrite the longest suffix of *word* that has a rule.

        The first character is never part of the suffix.  Returns ``None``
        when no suffix matches a rule.
        """
        for start in range(1, len(word)):
            suffix = word[start:]
            if suffix in self.rulesDict:
                return word[:start] + self.rulesDict[suffix]
        return None

    def LoadRules(self):
        """Parse ``self.rules_file`` into a ``{suffix: replacement}`` dict.

        Lines starting with ``#`` and lines without ``=`` are ignored.
        Surrounding whitespace and quotes are removed from both sides and
        both sides are normalized with :meth:`singleencode`.

        :returns: dictionary mapping a suffix to its replacement
        """
        rules_dict = {}
        with open(self.rules_file, 'rb') as rules_file_object:
            for raw_line in rules_file_object:
                try:
                    line = raw_line.decode('utf-8')
                except UnicodeDecodeError:
                    # Malformed lines are skipped rather than aborting.
                    continue
                if line == '' or line[0] == '#':
                    continue
                items = line.strip().split('=')
                if len(items) < 2:
                    continue
                lhs = items[0].strip().strip('"').strip("'")
                rhs = items[1].strip().strip('"').strip("'")
                rules_dict[self.singleencode(lhs)] = self.singleencode(rhs)
        return rules_dict

    def trim(self, word):
        """Strip whitespace and trailing punctuation from *word*.

        The first character is always kept, even if it is punctuation.

        :param word: a single word
        :returns: the trimmed word
        """
        word = word.strip()
        return word[:1] + word[1:].rstrip(self._PUNCTUATIONS)

    def get_module_name(self):
        """Return the module name."""
        return "Stemmer"

    def get_info(self):
        """Return info on the module."""
        return "Malayalam Stemmer(Experimental)"
class Hindi:
    """Hindi stemmer that strips the longest matching inflectional suffix."""

    # Suffixes grouped by their length in code points.
    _SUFFIXES = {
        1: [u"ो", u"े", u"ू", u"ु", u"ी", u"ि", u"ा"],
        2: [u"कर", u"ाओ", u"िए", u"ाई", u"ाए", u"ने", u"नी", u"ना",
            u"ते", u"ीं", u"ती", u"ता", u"ाँ", u"ां", u"ों", u"ें"],
        3: [u"ाकर", u"ाइए", u"ाईं", u"ाया", u"ेगी", u"ेगा", u"ोगी",
            u"ोगे", u"ाने", u"ाना", u"ाते", u"ाती", u"ाता", u"तीं",
            u"ाओं", u"ाएं", u"ुओं", u"ुएं", u"ुआं", u"ाएँ"],
        4: [u"ाएगी", u"ाएगा", u"ाओगी", u"ाओगे", u"एंगी", u"ेंगी",
            u"एंगे", u"ेंगे", u"ूंगी", u"ूंगा", u"ातीं", u"नाओं",
            u"नाएं", u"ताओं", u"ताएं", u"ियाँ", u"ियों", u"ियां"],
        5: [u"ाएंगी", u"ाएंगे", u"ाऊंगी", u"ाऊंगा", u"ाइयाँ",
            u"ाइयों", u"ाइयां"],
    }

    def stem(self, text):
        """Stem every whitespace-separated word of *text*.

        Suffix groups are tried from the longest (5) to the shortest (1).
        A group is only considered when the word is more than one character
        longer than the group's suffix length; the first suffix that matches
        is removed.

        :param text: Hindi string (text, or UTF-8 encoded bytes)
        :returns: Dictionary mapping each word to its stem; words without a
            matching suffix map to themselves.
        """
        dic_hi = {}
        for word in text.split():
            if isinstance(word, bytes):
                try:
                    word = word.decode("utf-8")
                except UnicodeDecodeError:
                    # Not valid UTF-8: cannot be matched against the
                    # suffixes, so report it unchanged.
                    dic_hi[word] = word
                    continue
            dic_hi[word] = self._strip_suffix(word)
        return dic_hi

    def _strip_suffix(self, word):
        """Return *word* without its first matching suffix, if any."""
        for length in (5, 4, 3, 2, 1):
            if len(word) <= length + 1:
                continue
            for suf in self._SUFFIXES[length]:
                if word.endswith(suf):
                    return word[:-len(suf)]
        return word
class Punjabi:
    """Punjabi stemmer that strips the first matching inflectional suffix."""

    # Suffix groups, tried from key 5 down to key 1.
    # NOTE(review): several entries contain embedded spaces (which can never
    # match, because words come from str.split()) and some use Devanagari
    # vowel signs instead of Gurmukhi ones; groups 4 and 5 also overlap.
    # The data is kept exactly as it was - confirm against the intended
    # suffix list before changing it.
    _SUFFIXES = {
        1: [u"ੀ ਆਂ ", u"िਆਂ", u"ੂਆਂ", u"ੀ ਏ", u"ੀ ਓ"],
        2: [u"ਈ", u"ੇ", u"ू", u"ु", u"ी", u"ि", u"ा", u"ੋ", u"ਜ",
            u"ਜ਼", u"ਸ"],
        3: [u"िਓ", u"ਾ ਂ", u"ੀ ਂ", u"ੋ ਂ"],
        4: [u"ਿਉ ਂ", u"ਵਾਂ", u"ੀ ਆ", u"िਆ", u"ਈਆ"],
        5: [u"ੀ ਆ", u"िਆ", u"ਈਆ"],
    }

    # Groups whose suffixes keep their first character on the stem.
    _KEEP_FIRST_CHAR = (5, 1)

    def stem(self, text):
        """Stem every whitespace-separated word of *text*.

        A suffix group ``L`` is only considered when the word has more than
        ``L + 1`` characters.  For groups 5 and 1 the first character of the
        matched suffix is kept on the stem; for the other groups the whole
        suffix is removed.

        :param text: Punjabi string (text, or UTF-8 encoded bytes)
        :returns: Dictionary mapping each word to its stem; words without a
            matching suffix map to themselves.
        """
        dic_hi = {}
        for word in text.split():
            if isinstance(word, bytes):
                try:
                    word = word.decode("utf-8")
                except UnicodeDecodeError:
                    # Not valid UTF-8: cannot be matched against the
                    # suffixes, so report it unchanged.
                    dic_hi[word] = word
                    continue
            dic_hi[word] = self._strip_suffix(word)
        return dic_hi

    def _strip_suffix(self, word):
        """Return *word* with its first matching suffix removed, if any."""
        for group in (5, 4, 3, 2, 1):
            if len(word) <= group + 1:
                continue
            for suf in self._SUFFIXES[group]:
                if not word.endswith(suf):
                    continue
                removed = len(suf)
                if group in self._KEEP_FIRST_CHAR:
                    removed -= 1
                return word[:len(word) - removed]
        return word
def getInstance(target_language):
    """Return a stemmer instance for *target_language*.

    The language code is compared case-insensitively.  Only ``ml_IN``
    (Malayalam) is recognised; any other code yields ``None``.
    """
    language_code = target_language.lower()
    return Malayalam() if language_code == 'ml_in' else None