Source code for libindic.stemmer

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email:
# santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in

import os

import marisa_trie
from rreplace import rreplace


[docs]class Malayalam:
    """
    Malayalam Stemmer class.
    """

    def __init__(self):
        self.rules_file = os.path.join(
            os.path.dirname(__file__), 'data/ml.rules')
        self.rulesDict = None
        self.dictionary_file = open(os.path.join(
            os.path.dirname(__file__), 'data/rootwords.txt'))
        self.dictionary = self.dictionary_file.readlines()
        self.dictionary_file.close()
        try:
            self.dictionary = marisa_trie.Trie([x.strip().decode('utf-8')
                                                for x in self.dictionary])
        except:
            self.dictionary = marisa_trie.Trie(
                [x.strip() for x in self.dictionary])

[docs]    def singleencode(self, word):
        '''
        Normalize word to single encoding.
        '''
        replace = {'\\u0d15\\u0d4d\\u200d': '\\u0d7f',
                   '\\u0d23\\u0d4d\\u200d': '\\u0d7a',
                   '\\u0d28\\u0d4d\\u200d': '\\u0d7b',
                   '\\u0d30\\u0d4d\\u200d': '\\u0d7c',
                   '\\u0d32\\u0d4d\\u200d': '\\u0d7d',
                   '\\u0d33\\u0d4d\\u200d': '\\u0d7e'}
        for character in replace:
            word = word.replace(character, replace[character])
        return word

[docs]    def stem(self, text):
        """
        :param text: Malayalam string
        :returns: Dictionary with words of the string as keys and their \
        corresponding stems as values.
        """
        if self.rulesDict is None:
            self.rulesDict = self.LoadRules()
        words = text.split(" ")
        word_count = len(words)
        result_dict = dict()
        word_iter = 0
        word = ""
        while word_iter < word_count:
            word = words[word_iter]
            original_word = words[word_iter]
            word = self.trim(word)
            word = word.strip('!,.?:')
            try:
                result = self.trim(word).decode('utf-8')
            except:
                result = word
            result = self.singleencode(result)
            word = result
            if result in self.dictionary:
                result_dict[original_word] = result
                word_iter += 1
                continue
            found = True
            while found:
                # For multilevel inflection handling.
                # Repeats stemming until
                # (i) a mis-hit in rules or
                # (ii) a rootword is found
                # 'found' variable is used to detect mis-hit, for
                # each intermediate form of word
                found = False
                counter = 1
                if result in self.dictionary:
                    result_dict[original_word] = result
                    break
                while counter < len(result):
                    suffix = result[counter:]  # Right to left suffix stripping
                    if suffix in self.rulesDict:
                        result = result[:counter] + self.rulesDict[suffix]
                        # A satisfying rule found, continue stemming.
                        found = True
                        break
                    counter = counter + 1
                # Stop stemming, no matching rules found - probably a root
                # word.
            word_iter += 1
            result_dict[original_word] = result
        return result_dict

    def LoadRules(self):
        rules_dict = dict()
        rules_file_object = open(self.rules_file)
        rules_text = rules_file_object.readlines()
        rules_file_object.close()
        rules_dict = {}
        for line in rules_text:
            if line == '' or line[0] == '#':
                continue
            items = line.strip().split('=')
            try:
                try:
                    lhs = items[0].strip().strip(
                        '"').strip("'").decode('utf-8')
                    rhs = items[1].strip().strip(
                        '"').strip("'").decode('utf-8')
                except:
                    lhs = items[0].strip().strip('"').strip("'")
                    rhs = items[1].strip().strip('"').strip("'")
                lhs = self.singleencode(lhs)
                rhs = self.singleencode(rhs)
                rules_dict[lhs] = rhs
            except:
                continue
        return rules_dict

    def trim(self, word):
        punctuations = ['~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
                        '-', '+', '_', '=', '{', '}', '|', ':', ';', '<', '>',
                        '\,', '.', '?']
        word = word.strip()
        index = len(word) - 1
        while index > 0:
            if word[index] in punctuations:
                word = word[0:index]
            else:
                break
            index = index - 1
        return word

[docs]    def get_module_name(self):
        """
        returns the module name.
        """
        return "Stemmer"

[docs]    def get_info(self):
        """
        returns info on the module
        """
        return "Malayalam Stemmer(Experimental)"


[docs]class Hindi:
    '''
    Hindi Stemmer Class
    '''

    def stem(self, text):
        suffixes = {1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
                    2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना",
                        "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
                    3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा",
                        "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता",
                        "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं", "ाएँ"],
                    4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी",
                        "ेंगी", "एंगे", "ेंगे", "ूंगी",
                        "ूंगा", "ातीं", "नाओं",
                        "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
                    5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा",
                        "ाइयाँ", "ाइयों", "ाइयां"]}
        tag = [1, 2, 3, 4, 5]
        tag.reverse()
        dic_hi = {}
        for word in text.split():
            try:
                word = word.decode("utf-8")
            except:
                pass
            flag = 0
            for L in tag:
                if flag == 1:
                    break
                if len(word) > L + 1:
                    for suf in suffixes[L]:
                        suf = suf.decode("utf-8")
                        if word.endswith(suf):
                            word1 = rreplace(word, suf, '', 1)
                            dic_hi[word] = word1
                            flag = 1
                            break
            if flag == 0:
                dic_hi[word] = word
        return dic_hi


[docs]class Punjabi:
    '''
    Punjabi Stemmer Class
    '''

    def stem(self, text):
        suffixes = {1: ["ੀ ਆਂ ", "िਆਂ", "ੂਆਂ", "ੀ ਏ", "ੀ ਓ"],
                    2: ["ਈ", "ੇ", "ू", "ु", "ी",
                        "ि", "ा", "ੋ", "ਜ", "ਜ਼", "ਸ"],
                    3: ["िਓ", "ਾ ਂ", "ੀ ਂ", "ੋ ਂ"],
                    4: ["ਿਉ ਂ", "ਵਾਂ", "ੀ ਆ", "िਆ", "ਈਆ"],
                    5: ["ੀ ਆ", "िਆ", "ਈਆ"]}
        tag = [1, 2, 3, 4, 5]
        tag.reverse()
        dic_hi = {}
        for word in text.split():
            flag = 0
            word = word.decode("utf-8")
            for L in tag:
                if flag == 1:
                    break
                if len(word) > L + 1:
                    if L == 5 or L == 1:
                        for suf in suffixes[L]:
                            suf = suf.decode("utf-8")
                            if word.endswith(suf):
                                word1 = rreplace(word, suf[1:], '', 1)
                                dic_hi[word] = word1
                                flag = 1
                                break
                    else:
                        for suf in suffixes[L]:
                            suf = suf.decode("utf-8")
                            if word.endswith(suf):
                                word1 = rreplace(word, suf, '', 1)
                                dic_hi[word] = word1
                                flag = 1
                                break
            if flag == 0:
                dic_hi[word] = word
        return dic_hi


def getInstance(target_language):
    if target_language.lower() == 'ml_in':
        return Malayalam()