# day1
import numpy as np
import pandas as pd
# Data preprocessing walkthrough: load a CSV, impute missing values, encode
# categorical columns, split into train/test sets and standardize features.
dataset = pd.read_csv('Data.csv', header=0)
# Features are every column except the last; the target is read from column
# index 3.
# NOTE(review): index 3 is the last column only for a 4-column CSV — confirm.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values
print(X)
print("------------处理Nan值-------------------")
from sklearn.impute import SimpleImputer
# Replace NaN entries in feature columns 1 and 2 with the mean of each column.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)
print("----------------将字母那种也处理成数据--------------------")
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
# Map the values of feature column 0 to integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
print(X)
print("----------------独热编码-------------------------------")
# English summary of the note below: ColumnTransformer takes a list of
# (name, transformer, columns) tuples. Columns that are not listed are dropped
# by default (remainder='drop'); remainder='passthrough' keeps them unchanged.
"""
ColumnTransformer(transformers, -->(名称,转换器,列)元组的列表,指定要应用于数据子集的转换器对象。
remainder=‘drop’, -->默认为’drop’,未指定的列将被删除.通过指定
remainder=‘passthrough’,所有未指定的剩余列transformers将自动通过)
"""
# One-hot encode column 0 and pass the remaining columns through untouched.
ct = ColumnTransformer([('my_ohe', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
print(X)
print(Y)
print("----------------将数据集分割为训练集和测试集---------------------------")
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(X_train)
print("*********")
print(X_test)
print("**********")
print(Y_train)
print("********")
print(Y_test)
print("-----------特征缩放-------------------")
# English summary of the note below: many machine-learning algorithms rely on
# the Euclidean distance between points, and features that differ widely in
# magnitude, unit and range distort it, so the features need to be scaled.
'''
大多数机器学习算法在计算中使用两个点之间的欧氏距离,特征在幅度,单位和范围上引起了很大的变化
所以我们需要一个特征缩放
'''
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Learn the mean and standard deviation from the training data only ...
X_train = sc_X.fit_transform(X_train)
# ... and reuse those statistics for the test data. Refitting on the test set
# would scale it differently from the training set and leak test information.
X_test = sc_X.transform(X_test)
print(X_train)
print("****")
print(X_test)
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
class StandScale:
    """Standardize the columns of a 2-D array to zero mean and unit variance.

    ``mean_`` and ``scale_`` are ``None`` until :meth:`fit` has been called.
    """

    def __init__(self):
        # Per-column mean and standard deviation learned by fit().
        self.mean_ = None
        self.scale_ = None

    def fit(self, x):
        """Learn the per-column mean and standard deviation of *x*.

        Args:
            x: 2-D NumPy array with one row per sample and one column per
                feature.

        Returns:
            This instance, so that calls can be chained.
        """
        self.mean_ = np.mean(x, axis=0)
        std = np.std(x, axis=0)
        # A constant column has zero spread. Dividing by 1 instead leaves it
        # at 0 after centering rather than producing NaN/inf.
        self.scale_ = np.where(std == 0, 1.0, std)
        return self

    def transform(self, x):
        """Standardize *x* in place with the statistics learned by fit().

        Args:
            x: 2-D NumPy array with the same number of columns as the array
                passed to fit(). It is overwritten with the scaled values, so
                pass a float array; an integer array would truncate them.

        Returns:
            The same array object *x*, now holding the scaled values.
        """
        # Slice assignment keeps the caller's array object, so code that
        # relies on the in-place update keeps working.
        x[...] = (x - self.mean_) / self.scale_
        return x
# Demo: standardize the iris feature matrix with the hand-written StandScale.
data = datasets.load_iris()
data_x, data_y = data.data, data.target
data_name = data.target_names
print(type(data_y))

# Hold out 20% of the samples; no random_state is given, so the split differs
# from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
)

# Inspect the raw data: mean of column 1, the full shape and the column count.
print(data_x[:, 1].mean())
shape = data_x.shape
print(shape)
print(shape[1])

# Per-column means, one column at a time (data_x.T yields each column in turn).
a = np.array([column.mean() for column in data_x.T])
print(a)

# Fit on the full feature matrix, then scale it; the result is read back from
# data_x itself, which transform() updates in place.
standscale = StandScale()
standscale.fit(data_x)
standscale.transform(data_x)
print(data_x)
|