TF-IDF
Reference: https://mofanpy.com/tutorials/machine-learning/nlp/intro-search/
TF-IDF is used to find the best matches for a search query. It represents each document by how important and how distinctive its words are, then compares the query against those representations and returns a list of the most similar documents.
Within one document, the more important a point is, the more often it is emphasized, so its frequency is higher. Words with a high term frequency (TF, Term Frequency) therefore characterize a document. In its basic form, the TF of word w in document d is: TF = count of word w in document d.
However, some high-frequency words carry no meaning and have no discriminative power; looking only at local information (the TF inside one document) introduces a statistical bias. So a global quantity, IDF (Inverse Document Frequency), is introduced to measure how discriminative a word is across the whole collection. In its basic form, the IDF of word w is: IDF = log(total number of documents / number of documents containing word w).
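As a sanity check, here is a minimal hand-worked sketch of these two basic formulas; the four toy documents and the word "cat" are made up purely for illustration, and the implementation below uses smoothed variants of the same idea:
import numpy as np

# toy corpus: 4 made-up documents; we score the word "cat" in the last one
toy_docs = ["cat sat on the mat", "dogs and birds", "the mat", "cat cat cat"]
word = "cat"

# TF: how many times the word occurs in document d
tf = toy_docs[3].split().count(word)                        # 3

# IDF: log(total number of documents / number of documents containing the word)
n_docs_with_word = sum(word in d.split() for d in toy_docs) # 2
idf = np.log(len(toy_docs) / n_docs_with_word)              # log(4 / 2) ≈ 0.69

print(tf * idf)   # ≈ 2.08, the TF-IDF weight of "cat" in the last document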
Of course, there are many variant formulas for both TF and IDF (the code below implements a few of them).
The simplest search engine precomputes the vectors of all documents. When a query comes in, the machine computes the query's TF-IDF values against the same vocabulary, turning the query into a vector of the same form, and then measures the distance between the query vector and every document vector; the smaller the distance, the more similar the document. Let's try to implement this process:
import numpy as np
from collections import Counter
import itertools
from visual import show_tfidf
docs = [
"it is a good day, I like to stay here",
"I am happy to be here",
"I am bob",
"it is sunny today",
"I have a party today",
"it is a dog and that is a cat",
"there are dog and cat on the tree",
"I study hard this morning",
"today is a good day",
"tomorrow will be a good day",
"I like coffee, I like book and I like apple",
"I do not like it",
"I am kitty, I like bob",
"I do not care who like bob, but I like kitty",
"It is coffee time, bring your cup",
]
docs_words = [d.replace(",","").split(" ") for d in docs]
vocab = set(itertools.chain(*docs_words))
v2i = {v: i for i, v in enumerate(vocab)}
i2v = {i: v for v, i in v2i.items()}
# variants of the IDF weighting; x is the per-word document frequency
idf_methods = {
    "log": lambda x: 1 + np.log(len(docs) / (x + 1)),
    "prob": lambda x: np.maximum(0, np.log((len(docs) - x) / (x + 1))),
    "len_norm": lambda x: x / (np.sum(np.square(x)) + 1),
}
def get_idf(method="log"):
    # document frequency: in how many documents each word appears
    df = np.zeros((len(i2v), 1))
    for i in range(len(i2v)):
        d_count = 0
        for d in docs_words:
            d_count += 1 if i2v[i] in d else 0
        df[i] = d_count

    idf_fn = idf_methods.get(method, None)
    if idf_fn is None:
        raise ValueError("unknown IDF method: " + method)
    return idf_fn(df)
# variants of the TF weighting; x is the normalized term-frequency matrix
tf_methods = {
    "log": lambda x: np.log(1 + x),
    "augmented": lambda x: 0.5 + 0.5 * x / np.max(x, axis=1, keepdims=True),
    "boolean": lambda x: np.minimum(x, 1),
}
def get_tf(method="log"):
    # term frequency: count of each word in each document, scaled by that document's most common word
    _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)
    for i, d in enumerate(docs_words):
        counter = Counter(d)
        for v in counter.keys():
            _tf[v2i[v], i] = counter[v] / counter.most_common(1)[0][1]

    weighted_tf = tf_methods.get(method, None)
    if weighted_tf is None:
        raise ValueError("unknown TF method: " + method)
    return weighted_tf(_tf)
tf = get_tf()
idf = get_idf()
tf_idf = tf * idf
def cosine_similarity(q, _tf_idf):
    unit_q = q / np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
    unit_ds = _tf_idf / np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
    similarity = unit_ds.T.dot(unit_q).ravel()
    return similarity
def docs_score(q, len_norm=False):
    # add any unseen query words to the vocabulary, padding idf / tf_idf with zero rows
    q_words = q.replace(",", "").split(" ")
    unknown_v = 0
    for v in set(q_words):
        if v not in v2i:
            v2i[v] = len(v2i)
            i2v[len(v2i) - 1] = v
            unknown_v += 1
    if unknown_v > 0:
        _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=np.float64)), axis=0)
        _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float64)), axis=0)
    else:
        _idf, _tf_idf = idf, tf_idf

    # turn the query into a TF-IDF vector over the same vocabulary
    counter = Counter(q_words)
    q_tf = np.zeros((len(_idf), 1), dtype=np.float64)
    for v in counter.keys():
        q_tf[v2i[v], 0] = counter[v]
    q_vec = q_tf * _idf

    q_scores = cosine_similarity(q_vec, _tf_idf)
    if len_norm:
        len_docs = [len(d) for d in docs_words]
        q_scores = q_scores / np.array(len_docs)
    return q_scores
q = "I get a coffee cup"
scores = docs_score(q)
d_ids = scores.argsort()[-3:][::-1]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))
show_tfidf(tf_idf.T, [i2v[i] for i in range(tf_idf.shape[0])], "tfidf_matrix")
def get_keywords(n=2):
    # print the n highest-weighted words of the first 3 documents
    for c in range(3):
        col = tf_idf[:, c]
        idx = np.argsort(col)[-n:]
        print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))

get_keywords()
In the huge two-dimensional TF-IDF table, indexed by document on one axis and by word on the other, a given document only ever mentions a fraction of the vocabulary, and the words it never mentions do not need to be stored. To keep memory usage down we therefore use a sparse matrix. scikit-learn's sparse-matrix support lets us compute and store massive amounts of data faster and more efficiently.
Still using the made-up documents above, calling sklearn's TF-IDF functionality directly is much more convenient:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from visual import show_tfidf
docs = [
"it is a good day, I like to stay here",
"I am happy to be here",
"I am bob",
"it is sunny today",
"I have a party today",
"it is a dog and that is a cat",
"there are dog and cat on the tree",
"I study hard this morning",
"today is a good day",
"tomorrow will be a good day",
"I like coffee, I like book and I like apple",
"I do not like it",
"I am kitty, I like bob",
"I do not care who like bob, but I like kitty",
"It is coffee time, bring your cup",
]
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(docs)
print("idf: ", [(n, idf) for idf, n in zip(vectorizer.idf_, vectorizer.get_feature_names())])
print("v2i: ", vectorizer.vocabulary_)
q = "I get a coffee cup"
qtf_idf = vectorizer.transform([q])
res = cosine_similarity(tf_idf, qtf_idf)
res = res.ravel().argsort()[-3:]
print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in res[::-1]]))
i2v = {i: v for v, i in vectorizer.vocabulary_.items()}
dense_tfidf = tf_idf.todense()
show_tfidf(dense_tfidf, [i2v[i] for i in range(dense_tfidf.shape[1])], "tfidf_sklearn_matrix")
In addition, TF-IDF can serve as training data: once a document is represented as a vector this way, it can be fed into a neural network as input for downstream processing of that document, such as classification, regression, or other fitting tasks.
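For example, here is a minimal sketch of that idea, pairing sklearn's TfidfVectorizer with a logistic-regression classifier; the documents and sentiment labels are made up purely for illustration:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# hypothetical training documents with made-up sentiment labels (1 = positive, 0 = negative)
train_docs = ["I like this movie", "today is a good day", "I do not like it", "this is terrible"]
train_labels = [1, 1, 0, 0]

vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(train_docs)   # sparse TF-IDF matrix, one row per document

clf = LogisticRegression()
clf.fit(x_train, train_labels)                   # the TF-IDF vectors are the model's input features

# new documents are transformed with the same fitted vocabulary, then classified
x_test = vectorizer.transform(["what a good movie", "I do not like this at all"])
print(clf.predict(x_test))
The same TF-IDF matrix could just as well be fed into a torch model; the classifier here is only the simplest stand-in.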
Word2Vec
A word vector is a vector representation of a word, and it is broadly useful for understanding words and even sentences. Word2Vec is a method for turning words into vectors. It comes with two algorithms, skip-gram and CBOW (Continuous Bag of Words); the biggest difference between them is that skip-gram uses the center word to predict the words around it, while CBOW uses the surrounding words to predict the center word.
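Here is a rough sketch of how the two algorithms slice the same sentence into (input, target) training pairs; it is only an illustration, not the process_w2v_data helper used in the code below:
words = ["我", "爱", "莫", "烦", "Python"]
window = 2

cbow_pairs, skip_gram_pairs = [], []
for i, center in enumerate(words):
    # context words within `window` positions on either side of the center word
    context = [words[j] for j in range(max(0, i - window), min(len(words), i + window + 1)) if j != i]
    cbow_pairs.append((context, center))               # CBOW: context -> center word
    skip_gram_pairs += [(center, c) for c in context]  # skip-gram: center word -> each context word

print(cbow_pairs[2])        # (['我', '爱', '烦', 'Python'], '莫')
print(skip_gram_pairs[:3])  # [('我', '爱'), ('我', '莫'), ('爱', '我')]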
CBOW
Reference: https://mofanpy.com/tutorials/machine-learning/nlp/cbow/
CBOW picks a word to predict and learns the relationship between that word and the words before and after it.
Take, for example, the sentence below (roughly: "I love Mofan Python; Mofan Python is easy to understand"):
我爱莫烦Python,莫烦Python通俗易懂。
The model splits this sentence into inputs and outputs, using the word vectors of the surrounding context to predict a word in the middle.
The model's inputs and outputs could be:
# 1
# Input:  [我, 爱] + [烦, Python]
# Output: 莫
# 2
# Input:  [爱, 莫] + [Python, ',']
# Output: 烦
# 3
# Input:  [莫, 烦] + [',', 莫]
# Output: Python
# 4
# Input:  [烦, Python] + [莫, 烦]
# Output: ','
By learning such word relationships from a large volume of phrases or articles, the model comes to understand how the word being predicted relates to its context, and the word vectors themselves are a by-product of this training process.
To obtain word vectors with real discriminative power, some fake data is made up here; we want the computer to learn the correct vector space for these fake words:
from torch import nn
import torch
from torch.nn.functional import cross_entropy,softmax
from utils import process_w2v_data
from visual import show_w2v_word_embedding
corpus = [
"5 2 4 8 6 2 3 6 4",
"4 8 5 6 9 5 5 6",
"1 1 5 2 3 3 8",
"3 6 9 6 8 7 4 6 3",
"8 9 9 6 1 4 3 4",
"1 0 2 0 2 1 3 3 3 3 3",
"9 3 3 0 1 4 7 8",
"9 9 8 5 6 7 1 2 3 0 1 0",
"a t g q e h 9 u f",
"e q y u o i p s",
"q o 9 p l k j o k k o p",
"h g y i u t t a e q",
"i k d q r e 9 e a d",
"o p d g 9 s a f g a",
"i u y g h k l a s w",
"o l u y a o g f s",
"o p i u y g d a s j d l",
"u k i l o 9 l j s",
"y g i s h k j l f r f",
"i o h n 9 9 d 9 f a 9",
]
class CBOW(nn.Module):
    def __init__(self, v_dim, emb_dim):
        super().__init__()
        self.v_dim = v_dim
        self.embeddings = nn.Embedding(num_embeddings=v_dim, embedding_dim=emb_dim)
        self.embeddings.weight.data.normal_(0, 0.1)
        self.hidden_out = nn.Linear(emb_dim, v_dim)
        self.opt = torch.optim.SGD(self.parameters(), momentum=0.9, lr=0.01)

    def forward(self, x, training=None, mask=None):
        # average the embeddings of the surrounding words into one context vector
        o = self.embeddings(x)
        o = torch.mean(o, dim=1)
        return o

    def loss(self, x, y, training=None):
        embedded = self(x, training)
        pred = self.hidden_out(embedded)
        return cross_entropy(pred, y)

    def step(self, x, y):
        self.opt.zero_grad()
        loss = self.loss(x, y, True)
        loss.backward()
        self.opt.step()
        return loss


def train(model, data):
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    for t in range(800000):
        bx, by = data.sample(100)
        bx, by = torch.from_numpy(bx).to(device), torch.from_numpy(by).to(device)
        loss = model.step(bx, by)
        if t % 200 == 0:
            print(f"step: {t} | loss: {loss}")


if __name__ == "__main__":
    d = process_w2v_data(corpus, skip_window=2, method="cbow")
    m = CBOW(d.num_word, 2)
    train(m, d)
    show_w2v_word_embedding(m, d, "./visual/results/cbow.png")
A common practice is to treat word vectors trained like this as a pretrained model: feed them into another neural network (an RNN, for example) as its input, let that network process them further, and train sentence vectors on top.
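A minimal sketch of that idea follows, assuming m is the CBOW model trained above; the GRU sentence encoder and everything else in it are made up for illustration:
import torch
from torch import nn

# reuse the learned embedding matrix as a frozen, pretrained layer inside a small RNN encoder
pretrained = m.embeddings.weight.data.clone()               # shape [num_word, emb_dim]

class SentenceEncoder(nn.Module):
    def __init__(self, pretrained_emb, hidden_dim=16):
        super().__init__()
        # freeze=True keeps the word vectors fixed while the RNN trains
        self.embeddings = nn.Embedding.from_pretrained(pretrained_emb, freeze=True)
        self.rnn = nn.GRU(pretrained_emb.shape[1], hidden_dim, batch_first=True)

    def forward(self, x):                                   # x: [batch, seq_len] word ids
        h, _ = self.rnn(self.embeddings(x))
        return h[:, -1]                                     # last hidden state as a sentence vector

encoder = SentenceEncoder(pretrained)
fake_batch = torch.randint(0, pretrained.shape[0], (3, 5))  # 3 "sentences" of 5 word ids each
print(encoder(fake_batch).shape)                            # torch.Size([3, 16])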
Skip-Gram
Reference: https://mofanpy.com/tutorials/machine-learning/nlp/skip-gram/
Whereas CBOW uses the surrounding context to predict the word in between, Skip-Gram turns the process around: it takes a word from the text and predicts the words around that word.
The biggest difference from CBOW is that Skip-Gram drops the intermediate SUM (averaging) step: what gets processed is always the embedding of a single input word.
Using the same fake data as above:
from torch import nn
import torch
from torch.nn.functional import cross_entropy
from utils import Dataset,process_w2v_data
from visual import show_w2v_word_embedding
corpus = [
"5 2 4 8 6 2 3 6 4",
"4 8 5 6 9 5 5 6",
"1 1 5 2 3 3 8",
"3 6 9 6 8 7 4 6 3",
"8 9 9 6 1 4 3 4",
"1 0 2 0 2 1 3 3 3 3 3",
"9 3 3 0 1 4 7 8",
"9 9 8 5 6 7 1 2 3 0 1 0",
"a t g q e h 9 u f",
"e q y u o i p s",
"q o 9 p l k j o k k o p",
"h g y i u t t a e q",
"i k d q r e 9 e a d",
"o p d g 9 s a f g a",
"i u y g h k l a s w",
"o l u y a o g f s",
"o p i u y g d a s j d l",
"u k i l o 9 l j s",
"y g i s h k j l f r f",
"i o h n 9 9 d 9 f a 9",
]
class SkipGram(nn.Module):
    def __init__(self, v_dim, emb_dim):
        super().__init__()
        self.v_dim = v_dim
        self.embeddings = nn.Embedding(v_dim, emb_dim)
        self.embeddings.weight.data.normal_(0, 0.1)
        self.hidden_out = nn.Linear(emb_dim, v_dim)
        self.opt = torch.optim.Adam(self.parameters(), lr=0.01)

    def forward(self, x, training=None, mask=None):
        # a single center word's embedding, no averaging step
        o = self.embeddings(x)
        return o

    def loss(self, x, y, training=None):
        embedded = self(x, training)
        pred = self.hidden_out(embedded)
        return cross_entropy(pred, y)

    def step(self, x, y):
        self.opt.zero_grad()
        loss = self.loss(x, y, True)
        loss.backward()
        self.opt.step()
        return loss


def train(model, data):
    if torch.cuda.is_available():
        print("GPU train available")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()
    for t in range(2500):
        bx, by = data.sample(8)
        bx, by = torch.from_numpy(bx).to(device), torch.from_numpy(by).to(device)
        loss = model.step(bx, by)
        if t % 200 == 0:
            print(f"step: {t} | loss: {loss}")


if __name__ == "__main__":
    d = process_w2v_data(corpus, skip_window=2, method="skip_gram")
    m = SkipGram(d.num_word, 2)
    train(m, d)
    show_w2v_word_embedding(m, d, "./visual/results/skipgram.png")