Preface
Most of the time, people are not deliberately lying; they simply lack a truly deep understanding of what they are saying.
String Operations
Basic text cleanup with Python's built-in string methods and the re module: stripping whitespace, removing punctuation, and applying custom transformations.
import re

text = [
    ' interrobang. By De.,mllie Suatin Chertanis ',
    ' Paking and, Going. ',
    ' Today is the night. '
]

# Strip leading and trailing whitespace from each string
print([s.strip() for s in text])

# Remove all periods
print([s.replace('.', '') for s in text])

# A custom transformation: convert each string to upper case
def capitalizer(s: str) -> str:
    return s.upper()

print([capitalizer(s) for s in text])

# A regex-based transformation: replace every letter with 'X'
def replace_with_X(s: str) -> str:
    return re.sub(r'[a-zA-Z]', 'X', s)

print([replace_with_X(s) for s in text])
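These one-off comprehensions can be folded into a single cleanup pass. A minimal sketch (the clean_text helper is my own naming, not from the original) chaining strip, period removal, and upper-casing:

def clean_text(s: str) -> str:
    # Chain the three steps demonstrated above
    s = s.strip()            # drop surrounding whitespace
    s = s.replace('.', '')   # remove periods
    return s.upper()         # upper-case everything

sample = ' interrobang. By De.,mllie Suatin Chertanis '
print(clean_text(sample))
# INTERROBANG BY DE,MLLIE SUATIN CHERTANIS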
Encoding Text as a Bag of Words
CountVectorizer builds a document-term matrix of raw token counts: each row is a document, each column a vocabulary token.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

text = np.array([
    'I love español and españa',
    'German i best',
    'Demllie Suatin Chertanis Zhangqi'
])

# Fit the vectorizer and build a sparse document-term count matrix
c = CountVectorizer()
bag = c.fit_transform(text)
print(bag)
(0, 7) 1
(0, 5) 1
(0, 0) 1
(0, 4) 1
(1, 6) 1
(1, 1) 1
(2, 3) 1
(2, 8) 1
(2, 2) 1
(2, 9) 1
Word Weighting
TfidfVectorizer replaces raw counts with tf-idf weights, down-weighting tokens that appear in many documents and l2-normalizing each row (the scikit-learn defaults).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

text = np.array([
    'I love español and españa',
    'German i best',
    'Demllie Suatin Chertanis Zhangqi'
])

# tf-idf: term frequency scaled by inverse document frequency
t = TfidfVectorizer()
feature_matrix = t.fit_transform(text)
print(feature_matrix)
print('------------------------------------')
print(feature_matrix.toarray())
print('------------------------------------')
print(t.vocabulary_)
(0, 4) 0.5
(0, 0) 0.5
(0, 5) 0.5
(0, 7) 0.5
(1, 1) 0.7071067811865476
(1, 6) 0.7071067811865476
(2, 9) 0.5
(2, 2) 0.5
(2, 8) 0.5
(2, 3) 0.5
------------------------------------
[[0.5        0.         0.         0.         0.5        0.5
  0.         0.5        0.         0.        ]
 [0.         0.70710678 0.         0.         0.         0.
  0.70710678 0.         0.         0.        ]
 [0.         0.         0.5        0.5        0.         0.
  0.         0.         0.5        0.5       ]]
------------------------------------
{'love': 7, 'español': 5, 'and': 0, 'españa': 4, 'german': 6, 'best': 1, 'demllie': 3, 'suatin': 8, 'chertanis': 2, 'zhangqi': 9}
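The round numbers are no accident: with the defaults (smooth_idf=True, norm='l2'), every token here occurs in exactly one of the three documents, so all terms in a row share the same idf, and l2 normalization yields 1/sqrt(k) per term: 0.5 for the four-token rows and 1/sqrt(2) ≈ 0.7071 for the two-token row. A small sketch that labels the columns (the pandas wrapper is my own addition; get_feature_names_out assumes scikit-learn >= 1.0):

import pandas as pd

# Label columns with their tokens to make the matrix readable
df = pd.DataFrame(feature_matrix.toarray(),
                  columns=t.get_feature_names_out())
print(df.round(4))

# Sanity check: all terms in a row share one idf, so each entry is 1/sqrt(k),
# where k is the number of kept tokens in that document
print(1 / np.sqrt(2))  # 0.7071067811865476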
Life is so boring. A whole week wasted because of someone else's idea; the feeling that my fate is not in my own hands is genuinely unpleasant. After living this long, I have no means of resisting fate other than slacking off. And after a year or two of slacking, how long would it take to set things right again? Or is this just how life ends up?