import pandas as pd
import numpy as np
# Load the Chinese training split and preview its first rows.
# The file name had been stored as GBK bytes mis-decoded as Latin-1
# ("ÖÐÎÄ"); it is restored to the intended characters so the path can match
# the file on disk.
# NOTE(review): "trian" looks like a misspelling of "train"; it is kept because
# the literal must match the actual file name — confirm against the data dir.
train_zh = pd.read_excel("data/中文_trian.xlsx")
train_zh.head()
| 原始文本 | 意图 | 槽值1 | 槽值2 |
---|
0 | 16.5度 | adjust_ac_temperature_to_number | offset:16.5 | NaN |
---|
1 | 16度 | adjust_ac_temperature_to_number | offset:16 | NaN |
---|
2 | 16空调开到16度 | adjust_ac_temperature_to_number | offset:16 | NaN |
---|
3 | 16温度16度 | adjust_ac_temperature_to_number | offset:16 | NaN |
---|
4 | 17度 | adjust_ac_temperature_to_number | offset:17 | NaN |
---|
# Load the English training split and preview its first rows.
# The file name had been mis-decoded (GBK bytes read as Latin-1, "Ó¢ÎÄ");
# it is restored so the path can match the file on disk.
train_en = pd.read_excel("data/英文_train.xlsx")
train_en.head()
| 原始文本 | 中文翻译 | 意图 | 槽值1 | 槽值2 |
---|
0 | open aircon please | 请打开空调 | open_ac | NaN | NaN |
---|
1 | I want to activate the AC | 我想打开空调 | open_ac | NaN | NaN |
---|
2 | I want to turn on the air conditioner | 我想打开空调 | open_ac | NaN | NaN |
---|
3 | switch on the AC please | 请打开空调 | open_ac | NaN | NaN |
---|
4 | Help me open the AC | 帮我打开空调 | open_ac | NaN | NaN |
---|
# Load the Japanese training split and preview its first rows.
# The file name had been mis-decoded (GBK bytes read as Latin-1, "ÈÕÓï");
# it is restored so the path can match the file on disk.
train_ja = pd.read_excel("data/日语_train.xlsx")
train_ja.head()
| 原始文本 | 中文翻译 | 意图 | 槽值1 | 槽值2 |
---|
0 | エアコンのスイッチONに | 打开空调开关 | open_ac | NaN | NaN |
---|
1 | エアコン入れる | 打开空调 | open_ac | NaN | NaN |
---|
2 | エアコンのスイッチを | 打开空调开关 | open_ac | NaN | NaN |
---|
3 | エアコンのスイッチ入れる | 打开空调开关 | open_ac | NaN | NaN |
---|
4 | エアコンのスイッチON | 打开空调开关 | open_ac | NaN | NaN |
---|
# Load the Japanese and English sheets of the test workbook and preview the
# Japanese one. The sheet names had been mis-decoded (GBK bytes read as
# Latin-1); they are restored so they can match the sheets in the workbook.
test_ja = pd.read_excel("data/testA.xlsx", sheet_name="日语_testA")
test_en = pd.read_excel("data/testA.xlsx", sheet_name="英文_testA")
test_ja.head()
| 原始文本 |
---|
0 | エアコンをつける |
---|
1 | エアコン付ける |
---|
2 | エアコンをつけてください |
---|
3 | エアコンをONに |
---|
4 | エアコン |
---|
# Preview the first five rows of the English test sheet.
test_en.head(n=5)
| 原始文本 |
---|
0 | switch on the AC |
---|
1 | air conditioner open |
---|
2 | Turn on the AC please |
---|
3 | I wanna switch on aircon please |
---|
4 | Help me switch on aircon |
---|
import jieba
import jieba.posseg as pseg
import nagisa
[dynet] random seed: 1234
[dynet] allocating memory: 32MB
[dynet] memory allocation done.
# Quick check of the Japanese tokenizer: tag one sample sentence and print
# the resulting word list. The sample literal had been mis-decoded (GBK bytes
# read as Latin-1); it is restored so the tokenizer receives real Japanese.
text = 'Pythonで簡単に使えるツールです'
words = nagisa.tagging(text)
print(words.words)
['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']
# Quick check of the Chinese tokenizer: segment one sample sentence into a
# list of words and print it. The sample literal had been mis-decoded (GBK
# bytes read as Latin-1); it is restored so jieba receives real Chinese.
words = jieba.lcut("查看训练集和测试集字段类型,并将数据读取代码写到博客;")
print(words)
['查看', '训练', '集', '和', '测试', '集字段', '类型', ',', '并', '将', '数据', '读取', '代码', '写', '到', '博客', ';']
# Same sample sentence through jieba's part-of-speech segmenter; prints the
# list of (word, tag) pairs. The sample literal had been mis-decoded (GBK
# bytes read as Latin-1); it is restored so jieba receives real Chinese.
words = pseg.lcut("查看训练集和测试集字段类型,并将数据读取代码写到博客;")
print(words)
[pair('查看', 'v'), pair('训练', 'vn'), pair('集', 'q'), pair('和', 'c'), pair('测试', 'vn'), pair('集字段', 'n'), pair('类型', 'n'), pair(',', 'x'), pair('并', 'c'), pair('将', 'd'), pair('数据', 'n'), pair('读取', 'v'), pair('代码', 'n'), pair('写', 'v'), pair('到', 'v'), pair('博客', 'nr'), pair(';', 'x')]
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
# Build the "words" text column that the vectorizer consumes:
#   * Japanese rows are tokenized with nagisa and re-joined with single
#     spaces so that a whitespace-based vectorizer can split them.
#   * English rows are only lower-cased.
# The source-text column key had been mis-decoded (GBK bytes read as Latin-1);
# it is restored to "原始文本" so it can match the column in the spreadsheets.
train_ja["words"] = train_ja["原始文本"].apply(
    lambda sentence: " ".join(nagisa.tagging(sentence).words)
)
train_en["words"] = train_en["原始文本"].apply(lambda sentence: sentence.lower())
test_ja["words"] = test_ja["原始文本"].apply(
    lambda sentence: " ".join(nagisa.tagging(sentence).words)
)
test_en["words"] = test_en["原始文本"].apply(lambda sentence: sentence.lower())
# Train one TF-IDF + logistic-regression intent classifier on the Japanese
# and English training rows together, then predict the intent of every test
# row. The intent column key had been mis-decoded (GBK bytes read as Latin-1);
# it is restored to "意图" so it can match the label column in the training
# spreadsheets, and the predictions are written under the same name.
# (The variable keeps its original spelling "pipline" in case later cells
# refer to it.)
pipline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipline.fit(
    train_ja["words"].to_list() + train_en["words"].to_list(),
    train_ja["意图"].to_list() + train_en["意图"].to_list(),
)
test_ja["意图"] = pipline.predict(test_ja["words"])
test_en["意图"] = pipline.predict(test_en["words"])
# No slot values are predicted by this baseline, so both slot columns are
# filled with NaN on each test frame. The column keys had been mis-decoded
# (GBK bytes read as Latin-1); they are restored to "槽值1" / "槽值2", the
# slot column names used by the training spreadsheets.
test_en["槽值1"] = np.nan
test_en["槽值2"] = np.nan
test_ja["槽值1"] = np.nan
test_ja["槽值2"] = np.nan
# Write the predictions to "submit_LR.xlsx", one sheet per language, without
# the helper "words" column and without the DataFrame index.
#
# The context manager saves and closes the workbook on exit. It replaces the
# former `writer.save()` + `writer.close()` pair: `ExcelWriter.save()` was
# removed in pandas 2.0, and it was redundant with `close()` anyway.
# The sheet names had been mis-decoded (GBK bytes read as Latin-1); they are
# restored to the names used by the test workbook.
with pd.ExcelWriter("submit_LR.xlsx") as writer:
    test_en.drop(columns=["words"]).to_excel(
        writer, sheet_name="英文_testA", index=False
    )
    test_ja.drop(columns=["words"]).to_excel(
        writer, sheet_name="日语_testA", index=False
    )
提交结果:
|