#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email:
# santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in
import io
import os

import marisa_trie

from rreplace import rreplace
class Malayalam:
    """
    Malayalam Stemmer class.

    Stemming is rule based: the longest suffix of a word that appears in
    the rules file is replaced by the rule's right-hand side, and this is
    repeated until the word is found in the root-word dictionary or no
    rule applies any more.
    """

    # Characters removed from the end of a token by trim(). The first
    # character of a token is never removed.
    _PUNCTUATIONS = '~!@#$%^&*()-+_={}|:;<>,.?'

    # (consonant + virama + ZERO WIDTH JOINER) -> atomic chillu letter.
    _CHILLU_FORMS = (
        (u'\u0d15\u0d4d\u200d', u'\u0d7f'),
        (u'\u0d23\u0d4d\u200d', u'\u0d7a'),
        (u'\u0d28\u0d4d\u200d', u'\u0d7b'),
        (u'\u0d30\u0d4d\u200d', u'\u0d7c'),
        (u'\u0d32\u0d4d\u200d', u'\u0d7d'),
        (u'\u0d33\u0d4d\u200d', u'\u0d7e'),
    )

    def __init__(self):
        """
        Locate the rules file and load the root-word dictionary.

        The rules themselves are loaded lazily on the first call to
        stem(). Both data files are read as UTF-8.
        """
        base_dir = os.path.dirname(__file__)
        self.rules_file = os.path.join(base_dir, 'data/ml.rules')
        self.rulesDict = None
        # The attribute is kept for backward compatibility; the handle is
        # closed as soon as the words have been read.
        self.dictionary_file = io.open(
            os.path.join(base_dir, 'data/rootwords.txt'), encoding='utf-8')
        with self.dictionary_file:
            root_words = [line.strip() for line in self.dictionary_file]
        self.dictionary = marisa_trie.Trie(root_words)

    def singleencode(self, word):
        '''
        Normalize word to single encoding.

        Every chillu written as consonant + virama + zero width joiner is
        replaced by the corresponding atomic chillu character.

        :param word: text (unicode) string
        :returns: the normalized string
        '''
        for sequence, chillu in self._CHILLU_FORMS:
            word = word.replace(sequence, chillu)
        return word

    def stem(self, text):
        """
        Stem every word of a Malayalam string.

        The text is split on single space characters only, so consecutive
        spaces produce an empty-string key in the result.

        :param text: Malayalam string
        :returns: Dictionary with words of the string as keys and their
                  corresponding stems as values.
        """
        if self.rulesDict is None:
            self.rulesDict = self.LoadRules()
        result_dict = {}
        for original_word in text.split(" "):
            word = self.trim(original_word).strip('!,.?:')
            # Python 2 callers may pass UTF-8 encoded byte strings.
            if isinstance(word, bytes):
                word = word.decode('utf-8')
            result_dict[original_word] = self._reduce(
                self.singleencode(word))
        return result_dict

    def _reduce(self, word):
        """Apply rules repeatedly until a root word or a dead end."""
        # Multilevel inflection handling: keep rewriting until
        # (i) the word is a known root word, or
        # (ii) no rule matches any suffix, or
        # (iii) a rule would recreate a form already produced, which
        #       would otherwise loop forever.
        seen = set([word])
        while word not in self.dictionary:
            rewritten = self._apply_first_rule(word)
            if rewritten is None or rewritten in seen:
                break
            seen.add(rewritten)
            word = rewritten
        return word

    def _apply_first_rule(self, word):
        """Rewrite the longest suffix that has a rule; None if none has."""
        # Starting at index 1 keeps at least the first character and
        # tries longer suffixes before shorter ones.
        for split_at in range(1, len(word)):
            suffix = word[split_at:]
            if suffix in self.rulesDict:
                return word[:split_at] + self.rulesDict[suffix]
        return None

    def LoadRules(self):
        """
        Read the rules file into a dictionary.

        Each rule line has the form ``suffix = replacement``; both sides
        may be wrapped in single or double quotes. Lines starting with
        ``#`` and lines without ``=`` are ignored. Both sides are passed
        through singleencode().

        :returns: Dictionary mapping suffix to replacement.
        """
        rules_dict = {}
        with io.open(self.rules_file, encoding='utf-8') as rules_file_object:
            for line in rules_file_object:
                if line.startswith('#'):
                    continue
                items = line.strip().split('=')
                if len(items) < 2:
                    # Blank or malformed line: nothing to record.
                    continue
                lhs = items[0].strip().strip('"').strip("'")
                rhs = items[1].strip().strip('"').strip("'")
                rules_dict[self.singleencode(lhs)] = self.singleencode(rhs)
        return rules_dict

    def trim(self, word):
        """
        Strip surrounding whitespace and trailing punctuation.

        The first character is always kept, so a token made only of
        punctuation is reduced to its first character.

        :param word: a single token
        :returns: the trimmed token
        """
        word = word.strip()
        return word[:1] + word[1:].rstrip(self._PUNCTUATIONS)

    def get_module_name(self):
        """
        returns the module name.
        """
        return "Stemmer"

    def get_info(self):
        """
        returns info on the module
        """
        return "Malayalam Stemmer(Experimental)"
class Hindi:
    '''
    Hindi Stemmer Class

    A light suffix-stripping stemmer: the longest listed suffix that a
    word ends with is removed, provided at least two characters remain.
    '''

    # Suffixes grouped by their length in characters. Unicode literals
    # keep the table usable on both Python 2 and Python 3 without
    # decoding at lookup time.
    _SUFFIXES = {
        1: [u"ो", u"े", u"ू", u"ु", u"ी", u"ि", u"ा"],
        2: [u"कर", u"ाओ", u"िए", u"ाई", u"ाए", u"ने", u"नी", u"ना",
            u"ते", u"ीं", u"ती", u"ता", u"ाँ", u"ां", u"ों", u"ें"],
        3: [u"ाकर", u"ाइए", u"ाईं", u"ाया", u"ेगी", u"ेगा",
            u"ोगी", u"ोगे", u"ाने", u"ाना", u"ाते", u"ाती", u"ाता",
            u"तीं", u"ाओं", u"ाएं", u"ुओं", u"ुएं", u"ुआं", u"ाएँ"],
        4: [u"ाएगी", u"ाएगा", u"ाओगी", u"ाओगे", u"एंगी",
            u"ेंगी", u"एंगे", u"ेंगे", u"ूंगी",
            u"ूंगा", u"ातीं", u"नाओं",
            u"नाएं", u"ताओं", u"ताएं", u"ियाँ", u"ियों", u"ियां"],
        5: [u"ाएंगी", u"ाएंगे", u"ाऊंगी", u"ाऊंगा",
            u"ाइयाँ", u"ाइयों", u"ाइयां"],
    }

    def stem(self, text):
        """
        Stem every whitespace-separated word of a Hindi string.

        :param text: Hindi string; UTF-8 encoded bytes are decoded
                     word by word.
        :returns: Dictionary with the (decoded) words as keys and their
                  stems as values. Words without a matching suffix map
                  to themselves.
        :raises UnicodeDecodeError: if a byte-string word is not UTF-8.
        """
        dic_hi = {}
        for word in text.split():
            if isinstance(word, bytes):
                word = word.decode("utf-8")
            dic_hi[word] = self._strip_suffix(word)
        return dic_hi

    def _strip_suffix(self, word):
        """Remove the longest matching suffix, or return word unchanged."""
        for length in (5, 4, 3, 2, 1):
            # A suffix is only removed when more than one character
            # would be left over.
            if len(word) <= length + 1:
                continue
            for suffix in self._SUFFIXES[length]:
                if word.endswith(suffix):
                    # The word ends with the suffix, so cutting it off is
                    # the same as replacing its last occurrence with ''.
                    return word[:len(word) - len(suffix)]
        return word
class Punjabi:
    '''
    Punjabi Stemmer Class

    A light suffix-stripping stemmer driven by five suffix groups that
    are tried from group 5 down to group 1.
    '''

    # Suffix groups. The group number is also used as a minimum-length
    # threshold: a group is only tried on words longer than number + 1.
    # NOTE(review): some entries contain spaces; if these are ordinary
    # spaces they cannot match a token produced by text.split() - confirm
    # against the upstream rule list.
    _SUFFIXES = {
        1: [u"ੀ ਆਂ ", u"िਆਂ", u"ੂਆਂ", u"ੀ ਏ", u"ੀ ਓ"],
        2: [u"ਈ", u"ੇ", u"ू", u"ु", u"ी",
            u"ि", u"ा", u"ੋ", u"ਜ", u"ਜ਼", u"ਸ"],
        3: [u"िਓ", u"ਾ ਂ", u"ੀ ਂ", u"ੋ ਂ"],
        4: [u"ਿਉ ਂ", u"ਵਾਂ", u"ੀ ਆ", u"िਆ", u"ਈਆ"],
        5: [u"ੀ ਆ", u"िਆ", u"ਈਆ"],
    }

    # For these groups the first character of the matched suffix is kept
    # and only the remainder is removed.
    _KEEP_FIRST_CHAR_GROUPS = (1, 5)

    def stem(self, text):
        """
        Stem every whitespace-separated word of a Punjabi string.

        :param text: Punjabi string; UTF-8 encoded bytes are decoded
                     word by word.
        :returns: Dictionary with the (decoded) words as keys and their
                  stems as values. Words without a matching suffix map
                  to themselves.
        :raises UnicodeDecodeError: if a byte-string word is not UTF-8.
        """
        dic_hi = {}
        for word in text.split():
            if isinstance(word, bytes):
                word = word.decode("utf-8")
            dic_hi[word] = self._strip_suffix(word)
        return dic_hi

    def _strip_suffix(self, word):
        """Apply the first matching suffix rule, or return word as is."""
        for group in (5, 4, 3, 2, 1):
            if len(word) <= group + 1:
                continue
            for suffix in self._SUFFIXES[group]:
                if not word.endswith(suffix):
                    continue
                if group in self._KEEP_FIRST_CHAR_GROUPS:
                    removed = suffix[1:]
                else:
                    removed = suffix
                # The word ends with `removed`, so cutting it off is the
                # same as replacing its last occurrence with ''.
                return word[:len(word) - len(removed)]
        return word
def getInstance(target_language):
    """
    Return a stemmer instance for the given language code.

    The code is compared case-insensitively. In the code visible here
    only 'ml_in' is recognised and yields a Malayalam stemmer; any other
    code falls through and returns None.
    """
    language_code = target_language.lower()
    if language_code == 'ml_in':
        return Malayalam()