python: มอดูลน่าสนใจ

รวมมอดูลที่น่าสนใจ

Log in or register to post comments
6134 reads

Topic:

python: แปลง Singular/Plural

ใช้แปลงคำนามระหว่าง Singular/Plural รุ่นนี้ใช้ได้กับภาษาอังกฤษและสเปน ดาวน์โหลดได้ที่ http://www.bermi.org/downloads

$ wget http://www.bermi.org/downloads/python_inflector-0.1.tar.gz

$ tar xfz python_inflector-0.1.tar.gz

$ cd python_inflector-0.1

$ python

Python 2.5.2 (r252:60911, Jul 31 2008, 07:39:27) 
[GCC 4.3.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import Inflector
>>> inf = Inflector.English()
>>> inf.singularize('oxen')
'ox'
>>> inf.pluralize('sheep')
'sheep'
>>>

ยังมีอีกหลายฟังก์ชัน เอาไว้ค่อยศึกษาอีกที

Update: ลองใช้แล้ว ไม่ค่อยดีเท่าไหร่ กฎต่าง ๆ ยังขาดความสมบูรณ์อีกมาก
ลองปรับปรุงเล็กน้อยเฉพาะส่วนของ Singular/Plural ให้พอใช้งานจริงได้
(แต่ก็ยังไม่ดีอยู่ดี)
$ vi Rules/English.py

# Copyright (c) 2006 Bermi Ferrer Martinez
# info at bermi dot org
# See the end of this file for the free software, open source license (BSD-style).

import re
from Base import Base

class English (Base):
    """
    Inflector for pluralize and singularize English nouns.

    This is the default Inflector for the Inflector obj

    Both directions apply the same lookup order:

    1. a word ending in one of ``uncountable_words`` is returned unchanged;
    2. a word ending in an ``irregular_words`` form has that ending swapped
       (case-insensitive suffix match, the typed first letter is kept);
    3. a word exactly equal to a ``reserve_words`` form (case-sensitive)
       is translated directly;
    4. otherwise the first matching regular-expression rule is applied.
    """

    # singular -> plural. Matched as a case-insensitive suffix, so compound
    # words such as 'policeman' are handled too.
    irregular_words = {
        'person' : 'people',
        'man' : 'men',
        'child' : 'children',
        'sex' : 'sexes',
        'foot' : 'feet',
        'goose' : 'geese',
        'tooth' : 'teeth',
    }

    # singular -> plural. Matched against the whole word only, exactly as
    # written (case-sensitive).
    reserve_words = {
        'ox' : 'oxen',
        'genus': 'genera',
        'corpus': 'corpora',
        'concerto': 'concerti',
        'bus' : 'buses',
        'virus' : 'viruses',
        'apparatus': 'apparatuses',
        'die' : 'dice',
        'this' : 'these',
        'that' : 'those',
        'quiz' : 'quizzes',
        'polka' : 'polkas',
        'passerby' : 'passersby',
        'sarcoma' : 'sarcomata',
        'schema' : 'schemata',
        'stigma' : 'stigmata',
        'stoma' : 'stomata',
        'cherub' : 'cherubim',
        'kibbutz' : 'kibbutzim',
        'seraph' : 'seraphim',
        'mum' : 'mums',
        # NOTE(review): 'boe' is an unusual plural for 'boa' - confirm.
        'boa' : 'boe',
    }

    # Words returned unchanged in both directions. Matched as a lower-cased
    # suffix, so any word merely ending in one of these is left alone too.
    uncountable_words = [
        'aircraft', 'equipment', 'information', 'rice',
        'money', 'species', 'series', 'fish', 'sheep', 'shrimp', 'sms',
        'moose', 'bison', 'deer', 'means', 'scissors', 'swine',
        'salmon',
        'meerschaum',
        'scum', 'talcum',
        'offspring',
        'sorghum', 'wampum',
        'phoenix',
    ]

    # Ordered (pattern, replacement) pairs; the first pattern that matches
    # wins, so specific endings must stay ahead of the generic ones.
    # Rules taken from
    # http://web2.uvcs.uvic.ca/elc/studyzone/330/grammar/irrplu.htm
    # http://www2.gsu.edu/~wwwesl/egw/pluralsn.htm
    _plural_rules = (
        ('(?i)eau$' , 'eaux'),
        ('(?i)menon$' , 'mena'),
        ('(?i)terion$' , 'teria'),
        ('(?i)(m|[^b]l)ouse$' , '\\1ice'),
        ('(?i)(d|l|r)ix$' , 'ices'),
        ('(?i)(d|p|t)ex$' , 'ices'),
        ('(?i)(ar|f|oo)f$' , '\\1fs'),
        ('(?i)(f|fe)$' , 'ves'),
        ('(?i)(pian|sol|temp)o$' , '\\1os'),
        ('(?i)(a|e|i|o|u|n)o$' , '\\1os'),
        ('(?i)(o|x|z|ch|ss|sh)$' , '\\1es'),
        ('(?i)-in-law$' , 's-in-law'),
        ('(?i)ful$' , 'sful'),
        ('(?i)is$' , 'es'),
        ('(?i)(d|(a|e|i|o|u)n|p|pl|r|s|t)us$' , '\\1uses'),
        ('(?i)(bu)s$' , '\\1ses'),
        ('(?i)us$' , 'i'),
        ('(?i)(a|b|dumd|e|g|h|k|(d|l|p|s|sy)l|o|r|s)um$' , '\\1ums'),
        ('(?i)um$' , 'a'),
        ('(?i)(a|e|i|o|u)a$' , '\\1as'),
        ('(?i)a$' , 'ae'),
        ('(?i)([^aeiouy]|qu)y$' , '\\1ies'),
        ('(?i)s$' , 'ses'),
        ('(?i)$' , 's'),
    )

    # Same layout as _plural_rules, for the opposite direction.
    # The '|ix|ex' and '|f|fe' replacements are inserted literally, e.g.
    # 'indices' becomes 'ind|ix|ex'.
    # NOTE(review): presumably this marks an ambiguous singular - confirm.
    _singular_rules = (
        ('(?i)eaux$' , 'eau'),
        ('(?i)mena$' , 'menon'),
        ('(?i)teria$' , 'terion'),
        ('(?i)([m|l])ice$' , '\\1ouse'),
        ('(?i)ices$' , '|ix|ex'),
        ('(?i)ves$' , '|f|fe'),
        ('(?i)os$' , 'o'),
        ('(?i)(o|x|z|ch|ss|sh)es$' , '\\1'),
        ('(?i)s-in-law$' , '-in-law'),
        ('(?i)sful$' , 'ful'),
        ('(?i)(d|(a|e|i|o|u)n|p|pl|r|s|t)uses$' , '\\1us'),
        ('(?i)(a|b|dumd|e|g|h|k|(d|l|p|s|sy)l|o|r|s)ums$' , '\\1um'),
        ('(?i)a$' , 'um'),
        ('(?i)i$' , 'us'),
        ('(?i)ses$' , 's'),
        ('(?i)es$' , 'is'),
        ('(?i)(a|e|i|o|u)as$' , '\\1a'),
        ('(?i)ae$' , 'a'),
        ('(?i)ies$' , 'y'),
        ('(?i)s$' , ''),
    )

    def _inflect(self, word, irregular_words, reserve_words, rules):
        '''Convert word using the given lookup tables and rule list.

        irregular_words and reserve_words map the source form to the target
        form; rules is an ordered sequence of (pattern, replacement) pairs.
        Returns the converted word, or word unchanged when nothing applies.
        '''
        # 1. Uncountable words have a single form.
        if word.lower().endswith(tuple(self.uncountable_words)):
            return word

        # 2. Irregular endings. Only the tail after the first letter is
        #    swapped, so the capitalisation typed by the caller survives.
        for source, target in irregular_words.items():
            match = re.search('(' + source + ')$', word, re.IGNORECASE)
            if match:
                return (word[:match.start()] + match.group(1)[0]
                        + target[1:] + word[match.end():])

        # 3. Whole-word exceptions.
        if word in reserve_words:
            return reserve_words[word]

        # 4. Generic suffix rules, first match wins.
        for pattern, replacement in rules:
            match = re.search(pattern, word, re.IGNORECASE)
            if match:
                # Drop back-references to groups that did not take part in
                # the match; older versions of re.sub raise on those.
                for index, group in enumerate(match.groups()):
                    if group is None:
                        replacement = replacement.replace(
                            '\\' + str(index + 1), '')
                return re.sub(pattern, replacement, word)

        return word

    def pluralize(self, word) :
        '''Pluralizes English nouns.

        Returns the plural form of word, e.g. 'ox' -> 'oxen', 'box' ->
        'boxes'; uncountable words such as 'sheep' are returned unchanged.
        '''
        return self._inflect(word, self.irregular_words, self.reserve_words,
                             self._plural_rules)

    def singularize (self, word) :
        '''Singularizes English nouns.

        Returns the singular form of word, e.g. 'oxen' -> 'ox', 'boxes' ->
        'box'; uncountable words such as 'sheep' are returned unchanged.
        '''
        # The class tables are keyed by the singular form; flip both so the
        # plural form becomes the lookup key.
        irregular_words = dict(
            [(plural, singular)
             for singular, plural in self.irregular_words.items()])
        reserve_words = dict(
            [(plural, singular)
             for singular, plural in self.reserve_words.items()])

        return self._inflect(word, irregular_words, reserve_words,
                             self._singular_rules)
    


# Copyright (c) 2006 Bermi Ferrer Martinez
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software to deal in this software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this software, and to permit
# persons to whom this software is furnished to do so, subject to the following
# condition:
#
# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THIS SOFTWARE.

Log in or register to post comments
8702 reads

Topic:

python

module