NLTK is best suited to processing English text
Tokenization
from nltk.tokenize import word_tokenize

# input_str is the raw English text to process (defined earlier)
tokens = word_tokenize(input_str)
tokens = [word.lower() for word in tokens]
tokens[:5]
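word_tokenize relies on the punkt tokenizer models; if they have not been downloaded yet it raises a LookupError, and a one-time download fixes that:
import nltk
nltk.download('punkt')  # tokenizer models used by word_tokenize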
Handling HTML tags and similar markup
from bs4 import BeautifulSoup

# nltk.clean_html() was removed in NLTK 3; BeautifulSoup's get_text() does the same job
clean = BeautifulSoup(html, 'html.parser').get_text()
tokens = clean.split()
A regular expression would also work, but this is more convenient; a regex sketch follows.
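For comparison, a minimal regex version (assuming html holds the raw markup):
import re

# crude tag stripping: anything of the form <...> becomes a space
clean = re.sub(r'<[^>]+>', ' ', html)
tokens = clean.split()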
Viewing the frequency distribution
freq_dist = nltk.FreqDist(tokens)
for k, v in freq_dist.items():
    print(k, ':', v)
Today : 1
's : 1
weather : 1
is : 1
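FreqDist also provides most_common(), which is handy when only the top few tokens matter:
freq_dist.most_common(5)  # the five most frequent tokens as (token, count) pairs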
The Text object
from nltk.text import Text
help(nltk.text)

t = Text(tokens)
t.count('code')   # number of occurrences of 'code'
t.index('code')   # index of the first occurrence
%matplotlib inline
t.plot(8)         # plot the 8 most frequent tokens
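Text offers a few more exploratory helpers; concordance(), for example, prints every occurrence of a word together with its surrounding context:
t.concordance('code')  # show 'code' in context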
Stopword filtering
Loading the stopword lists
from nltk.corpus import stopwords
stopwords.readme().replace('\n', ' ')
'Stopwords Corpus This corpus contains lists of stop words for several languages. These are high-frequency grammatical words which are usually ignored in text retrieval applications. They were obtained from: http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ The stop words for the Romanian language were obtained from: http://arlc.ro/resources/ The English list has been augmented https://github.com/nltk/nltk_data/issues/22 The German list has been corrected https://github.com/nltk/nltk_data/pull/49 A Kazakh list has been added https://github.com/nltk/nltk_data/pull/52 A Nepali list has been added https://github.com/nltk/nltk_data/pull/83 An Azerbaijani list has been added https://github.com/nltk/nltk_data/pull/100 A Greek list has been added https://github.com/nltk/nltk_data/pull/103 An Indonesian list has been added https://github.com/nltk/nltk_data/pull/112 '
stopwords.fileids()
['arabic',
'azerbaijani',
'danish',
'dutch',
'english',
'finnish',
'french',
'german',
'greek',
'hungarian',
'indonesian',
'italian',
'kazakh',
'nepali',
'norwegian',
'portuguese',
'romanian',
'russian',
'slovene',
'spanish',
'swedish',
'tajik',
'turkish']
Viewing the English stopword list
stopwords.raw('english').replace('\n', ' ')
"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't "
test_words = [word.lower() for word in tokens]
test_words_set = set(test_words)
test_words
['browse',
'the',
'latest',
'developer',
'documentation',
',',
'including',
'tutorials',
',',
'sample',
'code',
',',
'articles',
',',
'and',
'api',
'reference',
'.']
test_words_set
{',',
'.',
'and',
'api',
'articles',
'browse',
'code',
'developer',
'documentation',
'including',
'latest',
'reference',
'sample',
'the',
'tutorials'}
Checking the intersection with the stopword list
stopwords_english = set(stopwords.words('english'))
test_words_set.intersection(stopwords_english)
{'and', 'the'}
Filtering out the stopwords
filtered = [w for w in test_words_set if w not in stopwords_english]
filtered
['documentation',
'api',
'tutorials',
'articles',
'.',
'including',
'latest',
'code',
'sample',
'developer',
',',
'reference',
'browse']
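Filtering the set loses the original word order and duplicate counts; if those matter, filter the token list instead (same names as above, filtered_in_order is just an illustrative variable):
filtered_in_order = [w for w in test_words if w not in stopwords_english]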
Part-of-speech tagging with pos_tag
This relies on the averaged perceptron tagger (the averaged_perceptron_tagger resource) bundled with NLTK.
from nltk import pos_tag

tags = pos_tag(tokens)
tags
[('browse', 'VB'),
('the', 'DT'),
('latest', 'JJS'),
('developer', 'NN'),
('documentation', 'NN'),
(',', ','),
('including', 'VBG'),
('tutorials', 'NNS'),
(',', ','),
('sample', 'NN'),
('code', 'NN'),
(',', ','),
('articles', 'NNS'),
(',', ','),
('and', 'CC'),
('api', 'JJ'),
('reference', 'NN'),
('.', '.')]
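If the tagger model has not been downloaded, pos_tag raises a LookupError; the resource can be fetched once, and nltk.help.upenn_tagset() explains what the individual tags mean:
import nltk
nltk.download('averaged_perceptron_tagger')  # the tagger model used by pos_tag
nltk.download('tagsets')                     # tag documentation used by nltk.help
nltk.help.upenn_tagset('JJS')                # e.g. describes 'JJS' as a superlative adjective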
Chunking
Pull out the spans that match a particular pattern of constituents.
from nltk.chunk import RegexpParser

sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'),
            ('dog', 'NN'), ('died', 'VBD')]
# an optional determiner, any number of adjectives, then a noun
grammar = 'MY_NP: {<DT>?<JJ>*<NN>}'
cp = RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
(S (MY_NP the/DT little/JJ yellow/JJ dog/NN) died/VBD)
result.draw()
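result is an nltk.Tree, so the matched chunks can also be pulled out programmatically instead of drawn:
for subtree in result.subtrees(filter=lambda t: t.label() == 'MY_NP'):
    print(' '.join(word for word, tag in subtree.leaves()))
# -> the little yellow dog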
Named entity recognition
Related resources: maxent_ne_chunker, words
Tokenize, then get the part-of-speech tags
from nltk import ne_chunk

# use a name other than the built-in str
text = 'The Apple Developer Program provides everything you need to build and distribute your apps on the Mac App Store. '
tokens = word_tokenize(text)
tags = pos_tag(tokens)
print(ne_chunk(tags))
(S
The/DT
(ORGANIZATION Apple/NNP Developer/NNP Program/NNP)
provides/VBZ
everything/NN
you/PRP
need/VBP
to/TO
build/VB
and/CC
distribute/VB
your/PRP$
apps/NN
on/IN
the/DT
(ORGANIZATION Mac/NNP App/NNP Store/NNP)
./.)
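ne_chunk depends on the maxent_ne_chunker model and the words corpus mentioned above. Both can be fetched once, and the labelled entities read back out of the tree (a small sketch; entities is just an illustrative name):
import nltk
nltk.download('maxent_ne_chunker')  # the named-entity chunker model
nltk.download('words')              # word list it depends on

ne_tree = ne_chunk(tags)
# (entity text, entity label) pairs, e.g. ('Apple Developer Program', 'ORGANIZATION')
entities = [(' '.join(w for w, t in st.leaves()), st.label())
            for st in ne_tree.subtrees(filter=lambda t: t.label() != 'S')]
print(entities)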
tags
[('The', 'DT'),
('Apple', 'NNP'),
('Developer', 'NNP'),
('Program', 'NNP'),
('provides', 'VBZ'),
('everything', 'NN'),
('you', 'PRP'),
('need', 'VBP'),
('to', 'TO'),
('build', 'VB'),
('and', 'CC'),
('distribute', 'VB'),
('your', 'PRP$'),
('apps', 'NN'),
('on', 'IN'),
('the', 'DT'),
('Mac', 'NNP'),
('App', 'NNP'),
('Store', 'NNP'),
('.', '.')]
Data cleaning
- Remove extra whitespace
- Remove unwanted special characters
- Remove useless pieces such as URLs
Using regular expressions and the stopword list
import re
from nltk.corpus import stopwords

s = ' RT @Amila #Test\nTom\'s newly listed Co & Mary\'s unlisted Group to supply tech for nlTK.\nh $TSLA $AAPL https:// t.co/x34afsfQsh'
cache_english_stopwords = stopwords.words('english')

def text_clean(text):
    print('Original text:', text, '\n')
    # HTML entities, hashtags and @mentions
    text_no_special_entities = re.sub(r'\&\w*;|#\w*|@\w*', '', text)
    print('After removing special entities:', text_no_special_entities, '\n')
    # ticker symbols such as $TSLA
    text_no_tickers = re.sub(r'\$\w*', '', text_no_special_entities)
    print('After removing ticker symbols:', text_no_tickers, '\n')
    # hyperlinks
    text_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', text_no_tickers)
    print('After removing hyperlinks:', text_no_hyperlinks, '\n')
    # words of only one or two characters
    text_no_small_words = re.sub(r'\b\w{1,2}\b', '', text_no_hyperlinks)
    print('After removing short (1-2 character) words:', text_no_small_words, '\n')
    # collapse runs of whitespace and strip leading spaces
    text_no_whitespace = re.sub(r'\s\s+', ' ', text_no_small_words)
    text_no_whitespace = text_no_whitespace.lstrip(' ')
    print('After collapsing whitespace:', text_no_whitespace, '\n')
    tokens = word_tokenize(text_no_whitespace)
    print('Tokens:', tokens, '\n')
    # drop English stopwords
    list_no_stopwords = [i for i in tokens if i not in cache_english_stopwords]
    print('After stopword removal:', list_no_stopwords, '\n')
    text_filtered = ' '.join(list_no_stopwords)
    print('Filtered text:', text_filtered)

text_clean(s)