https://github.com/duoergun0729/nlp
# -*- coding: UTF-8 -*-
"""Load and label the Sogou news corpus (SogouTCE / sohusite dumps).

Maps each news URL to a channel id via the SogouTCE channel table, then
selects the article bodies for channels 81, 79 and 91, skipping empty
content.  Paths are relative to this script (../data/...).
"""
import re
from fastText import train_supervised
import numpy as np
import codecs
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder


def load_stopwords():
    """Return the stop-word list, one word per line of stopwords.txt."""
    with open("stopwords.txt") as F:
        stopwords = F.readlines()
    return [word.strip() for word in stopwords]


def load_SogouTCE():
    """Build the channel lookup table from SogouTCE.txt.

    Returns:
        dict mapping a channel URL prefix (http:// stripped) to its
        integer channel id (the line index in SogouTCE.txt).
    """
    SogouTCE = []
    SogouTCE_kv = {}
    with open("../data/SogouTCE.txt") as F:
        for line in F:
            (url, channel) = line.split()
            SogouTCE.append(url)
    for index, url in enumerate(SogouTCE):
        # Strip the http:// prefix so prefixes match raw article URLs.
        url = re.sub('http://', '', url)
        print("k:%s v:%d" % (url, index))
        SogouTCE_kv[url] = index
    return SogouTCE_kv


def load_url(SogouTCE_kv):
    """Label every article URL with its channel id.

    Args:
        SogouTCE_kv: dict of URL prefix -> channel id from load_SogouTCE().

    Returns:
        list of channel ids, one per matching line of news_sohusite_url.txt.
    """
    labels = []
    with open("../data/news_sohusite_url.txt") as F:
        for line in F:
            for k, v in SogouTCE_kv.items():
                if re.search(k, line, re.IGNORECASE):
                    print(v)
                    labels.append(v)
    return labels


def load_selecteddata(SogouTCE_kv):
    """Select article bodies for channels 81, 79 and 91.

    Args:
        SogouTCE_kv: dict of URL prefix -> channel id from load_SogouTCE().

    Returns:
        (x, y) where x is the list of article bodies and y the parallel
        list of channel ids.  Empty bodies are filtered out; url and
        content files are assumed to be line-aligned.
    """
    x = []
    y = []
    # Load the content list (one article body per line).
    with open("../data/news_sohusite_content.txt") as F:
        content = F.readlines()
    # Load the URL list, line-aligned with the content file.
    with open("../data/news_sohusite_url.txt") as F:
        url = F.readlines()
    for index, u in enumerate(url):
        for k, v in SogouTCE_kv.items():
            # Keep only channel ids 81, 79 and 91, and drop empty bodies.
            if re.search(k, u, re.IGNORECASE) and v in (81, 79, 91) and len(content[index].strip()) > 1:
                # Save the content matching this URL.
                x.append(content[index])
                y.append(v)
                continue
    return x, y
https://github.com/fighting41love/funNLP
https://github.com/ShulinCao/OpenNRE-PyTorch
https://github.com/liuhuanyong/ProductKnowledgeGraph
https://github.com/sebastianruder/NLP-progress
https://github.com/sebastianruder/NLP-progress
https://github.com/sebastianruder/NLP-progress
https://github.com/makcedward/nlp
https://github.com/microsoft/nlp-recipes
https://github.com/nlpinaction/learning-nlp
Knowledge graph construction: https://github.com/liuhuanyong/ProductKnowledgeGraph/blob/master/build_kg.py
https://github.com/IntelLabs/nlp-architect
https://github.com/fastnlp/fastNLP
https://github.com/PaddlePaddle/PaddleNLP
Important: https://github.com/PaddlePaddle/PaddleNLP
https://github.com/msgi/nlp-journey
https://github.com/lhyxcxy/nlp
https://github.com/NLP-LOVE/Introduction-NLP
https://github.com/yongzhuo/nlp_xiaojiang
https://github.com/aboSamoor/polyglot
|