3 Star 8 Fork 1

冰封飞飞 / 计算机英语词频统计

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
wordStandardized.py 3.28 KB
一键复制 编辑 原始数据 按行查看 历史
'''
英语单词词性还原
修改自https://github.com/rocketk/wordcounter
'''
'''
Copyright [rocketk]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import sys,re,collections,nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from utils import costCount
# Regular expressions used to clean text and to expand English contractions.
# They are written against text that may still contain apostrophes.

# Runs of characters that are not an ASCII letter, a space or an apostrophe.
pat_letter = re.compile(r"[^a-zA-Z ']+")
# 's after a pronoun-like word, meaning "is" (case-insensitive).
# The leading \b stops ordinary nouns that merely end in one of these words
# ("unit's", "circuit's") from being rewritten as "... is"; "where" is listed
# explicitly because it used to match only through its "here" suffix.
pat_is = re.compile(r"\b(it|he|she|that|this|there|here|where)('s)", re.I)
# 's directly after any letter (possessive or leftover contraction).
pat_s = re.compile(r"(?<=[a-zA-Z])'s")
# A trailing apostrophe (optionally followed by s) after a word ending in s.
pat_s2 = re.compile(r"(?<=s)'s?")
# n't, the contraction of "not".
pat_not = re.compile(r"(?<=[a-zA-Z])n't")
# 'd, treated as the contraction of "would".
pat_would = re.compile(r"(?<=[a-zA-Z])'d")
# 'll, the contraction of "will".
pat_will = re.compile(r"(?<=[a-zA-Z])'ll")
# 'm after I/i, the contraction of "am".
# The character class is [Ii]; a "|" inside [...] is a literal pipe, not "or".
pat_am = re.compile(r"(?<=[Ii])'m")
# 're, the contraction of "are".
pat_are = re.compile(r"(?<=[a-zA-Z])'re")
# 've, the contraction of "have".
pat_ve = re.compile(r"(?<=[a-zA-Z])'ve")
# Module-level WordNet lemmatizer, created once at import time and used by merge().
lmtzr = WordNetLemmatizer()
@costCount
def standardizedWords(text):
    '''
    Convert the words in ``text`` to a standardized form.

    The text is first passed through replace_abbreviations(), then split on
    whitespace, run through merge(), and finally re-joined with single spaces.
    '''
    cleaned_text = replace_abbreviations(text)
    standardized = list(merge(cleaned_text.split()))
    return ' '.join(standardized)
@costCount
def merge(words):
    '''
    Return a new list in which each non-empty word is replaced by its lemma.

    Each word is POS-tagged on its own; when get_wordnet_pos() yields a
    WordNet category for the tag the word is lemmatized with it, otherwise
    the word is kept as-is. Empty entries are dropped.
    '''
    lemmas = []
    for token in words:
        if not token:
            continue
        # tagged looks like [('bigger', 'JJR')]
        tagged = nltk.pos_tag(word_tokenize(token))
        wordnet_pos = get_wordnet_pos(tagged[0][1])
        if not wordnet_pos:
            lemmas.append(token)
            continue
        lemmas.append(lmtzr.lemmatize(token, wordnet_pos))
    return lemmas
@costCount
def get_wordnet_pos(treebank_tag):
    '''
    Map a POS tag to the matching WordNet part-of-speech constant.

    Tags starting with J, V, N and R map to ADJ, VERB, NOUN and ADV
    respectively; any other tag yields an empty string.
    '''
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Attribute is looked up only on a match, as in a plain if/elif chain.
            return getattr(nltk.corpus.wordnet, attr_name)
    return ''
@costCount
def replace_abbreviations(text):
    '''
    Clean ``text`` and expand common English contractions.

    Steps, in order:
      1. Replace every run of characters other than ASCII letters, spaces
         and apostrophes with a single space, strip the ends and lowercase.
      2. Expand "'s" after pronoun-like words to " is" and drop the
         remaining "'s" / trailing "'" forms.
      3. Expand the irregular negatives "can't", "won't" and "shan't", then
         the generic "n't", "'d", "'ll", "'m", "'re" and "'ve" forms.
      4. Replace any apostrophes still left with spaces.

    Returns the normalized string.
    '''
    # The generic n't rule only strips "n't", which would leave the bogus
    # stems "ca", "wo" and "sha"; these three need their full expansion.
    irregular_not = {'ca': 'can not', 'wo': 'will not', 'sha': 'shall not'}

    new_text = pat_letter.sub(' ', text).strip().lower()
    new_text = pat_is.sub(r"\1 is", new_text)
    new_text = pat_s.sub("", new_text)
    new_text = pat_s2.sub("", new_text)
    # Must run before pat_not, which would otherwise consume the "n't".
    new_text = re.sub(
        r"\b(ca|wo|sha)n't\b",
        lambda match: irregular_not[match.group(1)],
        new_text,
    )
    new_text = pat_not.sub(" not", new_text)
    new_text = pat_would.sub(" would", new_text)
    new_text = pat_will.sub(" will", new_text)
    new_text = pat_am.sub(" am", new_text)
    new_text = pat_are.sub(" are", new_text)
    new_text = pat_ve.sub(" have", new_text)
    new_text = new_text.replace('\'', ' ')
    return new_text
Python
1
https://gitee.com/bingfengfeifei/wordCount.git
git@gitee.com:bingfengfeifei/wordCount.git
bingfengfeifei
wordCount
计算机英语词频统计
master

搜索帮助