from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
# Load the tab-separated train and test sets with explicit column names.
# Missing cells are replaced by the string "0" before the columns are joined.
df_train = pd.read_table("train_fc.txt", names=['q1', 'q2', 'q3', 'q4', 'label', 'q5']).fillna("0")
df_test = pd.read_table("test_fc.txt", names=['q1', 'q2', 'q3', 'q4', 'label', 'q5']).fillna("0")

# Build one space-separated text field per row from q1, q2, q3 and q5
# (q4 is not part of the text).
df_train['text'] = df_train['q1'] + " " + df_train['q2'] + " " + df_train['q3'] + " " + df_train['q5']
df_test['text'] = df_test['q1'] + " " + df_test['q2'] + " " + df_test['q3'] + " " + df_test['q5']

# Fit the TF-IDF vocabulary on the training text ONLY and reuse it for the
# test text. Fitting a second vectorizer on the test set would produce a
# different vocabulary, so column i would mean a different term in train and
# test and the classifier's inputs would be meaningless.
tfidf_train = TfidfVectorizer()
tfidf_test = tfidf_train  # alias: the same fitted vectorizer serves both sets
tfidf_feature_train = tfidf_train.fit_transform(df_train['text'])
tfidf_feature_test = tfidf_train.transform(df_test['text'])

# Same rule for the dimensionality reduction: learn the 100 components from
# the training matrix, then project the test matrix onto those components.
svd = TruncatedSVD(n_components=100)
svd_feature_train = svd.fit_transform(tfidf_feature_train)
svd_feature_test = svd.transform(tfidf_feature_test)
from sklearn.svm import SVC
# Train a linear-kernel SVM on the reduced training features.
clf = SVC(kernel='linear')
clf.fit(svd_feature_train, np.array(df_train['label']))

# Predict one label per row of the reduced test features.
predicted = clf.predict(svd_feature_test)

# Load the reference labels used for scoring from a separate file.
# NOTE(review): this assumes the rows of "task1.testSentence.new" line up
# one-to-one with the rows of the test set that `predicted` was computed
# from -- confirm against the data files.
df_sj = pd.read_table("task1.testSentence.new", names=['q1', 'q2', 'q3', 'q4', 'label', 'q5', 'q6', 'q7', 'q8', 'q9']).fillna("0")
sj = df_sj['label']
from sklearn import metrics
def metrics_result(actual, predict):
    """Print weighted-average precision, recall and F1 score.

    Args:
        actual: Reference labels, passed as the first argument to the
            ``sklearn.metrics`` scoring functions.
        predict: Predicted labels, passed as the second argument.

    Each score is computed with ``average='weighted'`` and printed with
    three decimal places. Nothing is returned.
    """
    # The printed labels are Chinese for "precision" and "recall".
    print('精度:{0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
    print('召回:{0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))
    print('f1-score:{0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted')))
# Report precision / recall / F1 of the predictions against the reference labels.
metrics_result(sj, predicted)