import numpy as np
import pandas as pd
import os
# Work from the desktop folder so the relative data-file names used in this
# script resolve against it.
desktop_dir = "C:\\Users\\Administrator\\Desktop"
os.chdir(desktop_dir)

# Load the demo sheet (it contains missing cells) and show it.
data = pd.read_excel("missing.xlsx")
print(data)
a b c d
0 2.0 kj 4.0 7.0
1 2.0 kl 6.0 9.0
2 NaN kl 5.0 NaN
3 5.0 NaN NaN 9.0
4 6.0 kk 6.0 8.0
# 4x4 demo matrix with two missing entries: row 1 / column 3 and
# row 3 / column 2.  `c` is the raw ndarray, `C` the same data as a DataFrame.
c = np.array(
    [
        [1, 2, 3, 4],
        [4, 5, 6, np.nan],
        [5, 6, 7, 8],
        [9, 4, np.nan, 8],
    ]
)
C = pd.DataFrame(data=c)
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22 and is
# not used anywhere below; guard the legacy import so that it cannot abort the
# script on current releases.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None
from sklearn.impute import SimpleImputer

# Mean imputation on the DataFrame: every NaN is replaced by the mean of the
# non-missing values in its column.  The constructor arguments are passed by
# keyword because scikit-learn 1.0+ rejects them positionally.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
# fit_transform returns a new ndarray; `C` itself is left unchanged.
fC = imp.fit_transform(C)
print(fC)
[[1. 2. 3. 4. ]
[4. 5. 6. 6.66666667]
[5. 6. 7. 8. ]
[9. 4. 5.33333333 8. ]]
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=mean as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Median imputation on the raw ndarray: every NaN is replaced by the median of
# the non-missing values in its column.  Keyword arguments are required by
# scikit-learn 1.0+ (positional use only worked, with a FutureWarning, before).
imp = SimpleImputer(missing_values=np.nan, strategy="median")
# fit_transform returns a new array; `c` itself is left unchanged.
fc = imp.fit_transform(c)
print(fc)
[[1. 2. 3. 4.]
[4. 5. 6. 8.]
[5. 6. 7. 8.]
[9. 4. 6. 8.]]
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=median as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Mode imputation on the numeric columns "a" and "c" of the Excel sheet: every
# NaN is replaced by the most frequent value of its column.  Keyword arguments
# are required by scikit-learn 1.0+.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# The result is a plain ndarray (column labels are not carried over).
fD = imp.fit_transform(data[["a", "c"]])
print(fD)
[[2. 4.]
[2. 6.]
[2. 5.]
[5. 6.]
[6. 6.]]
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=most_frequent as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Load the saved NumPy array "data.npy" (resolved against the current working
# directory) and display it.
data1 = np.load("data.npy")
print(data1)
[[1.00000000e+00 1.70000000e+01 6.61764706e+01 3.20000000e+01
1.61496618e+03 1.31562500e+01]
[2.00000000e+00 8.00000000e+00 6.86875000e+01 3.60000000e+01
1.43564581e+02 3.80555556e+00]
[3.00000000e+00 1.60000000e+01 6.58437500e+01 4.30000000e+01
1.34413138e+03 1.26976744e+01]
...
[8.33000000e+02 1.00000000e+01 6.79500000e+01 2.40000000e+01
1.15874171e+02 2.79166667e+00]
[8.34000000e+02 2.10000000e+01 6.65000000e+01 4.10000000e+01
5.38712893e+02 2.03170732e+01]
[8.35000000e+02 1.10000000e+01 7.82727273e+01 9.00000000e+00
6.29832333e+01 9.44444444e+00]]
# Discard column 0 and keep every remaining column for the steps below.
# NOTE(review): column 0 looks like a running record number (1, 2, 3, ...) in
# the printout of the loaded array - confirm before relying on that.
data1 = data1[:,1:]
print(data1)
[[ 17. 66.17647059 32. 1614.96618125 13.15625 ]
[ 8. 68.6875 36. 143.56458056 3.80555556]
[ 16. 65.84375 43. 1344.13137674 12.69767442]
...
[ 10. 67.95 24. 115.87417083 2.79166667]
[ 21. 66.5 41. 538.71289268 20.31707317]
[ 11. 78.27272727 9. 62.98323333 9.44444444]]
# Fill any NaN in the feature matrix with the mean of its column before
# scaling.  Keyword arguments are required by scikit-learn 1.0+ (the positional
# form only worked, with a FutureWarning, on older releases).
imp1 = SimpleImputer(missing_values=np.nan, strategy="mean")
data1 = imp1.fit_transform(data1)
print(data1)
[[ 17. 66.17647059 32. 1614.96618125 13.15625 ]
[ 8. 68.6875 36. 143.56458056 3.80555556]
[ 16. 65.84375 43. 1344.13137674 12.69767442]
...
[ 10. 67.95 24. 115.87417083 2.79166667]
[ 21. 66.5 41. 538.71289268 20.31707317]
[ 11. 78.27272727 9. 62.98323333 9.44444444]]
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=mean as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
warnings.warn(f"Pass {args_msg} as keyword args. From version "
from sklearn.preprocessing import StandardScaler

# Standardise the imputed feature matrix with StandardScaler's default
# settings.  The scaler is fitted on `data1` and then applied to the same
# data; `data1` itself is not modified.
scaler = StandardScaler()
x = scaler.fit(data1).transform(data1)
print(x)
[[ 0.20025842 -0.82760637 0.05554634 2.84353829 0.76954149]
[-0.68918721 -0.09224269 0.20662516 -0.18554119 -0.65156099]
[ 0.10143112 -0.92504475 0.47101308 2.28598816 0.69984796]
...
[-0.49153262 -0.30822213 -0.24661129 -0.24254565 -0.80565008]
[ 0.59556758 -0.73285966 0.39547367 0.62792513 1.85783108]
[-0.39270533 2.71482439 -0.81315684 -0.35142881 0.20542766]]
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling of the same imputed feature matrix with MinMaxScaler's
# default settings.  The scaler is fitted on `data1` and then applied to the
# same data; `data1` itself is not modified.
mms = MinMaxScaler()
x1 = mms.fit(data1).transform(data1)
print(x1)
[[0.38095238 0.04406273 0.2519685 0.33941778 0.1381392 ]
[0.16666667 0.17158327 0.28346457 0.03015525 0.03188131]
[0.35714286 0.0271658 0.33858268 0.28249311 0.13292812]
...
[0.21428571 0.13412995 0.18897638 0.02433521 0.02035985]
[0.47619048 0.06049291 0.32283465 0.11320842 0.2195122 ]
[0.23809524 0.65836106 0.07086614 0.01321848 0.0959596 ]]
# Read the workbook; the first column is skipped so that only the remaining
# (indicator) columns enter the analysis.
Data = pd.read_excel('农村居民人均可支配收入来源2016.xlsx')
X = Data.iloc[:, 1:]

# Pairwise Pearson correlation matrix of the indicator columns.
R = X.corr(method="pearson")
print(R)
工资性收入 经营净收入 财产净收入 转移净收入
工资性收入 1.000000 -0.388997 0.826683 0.401917
经营净收入 -0.388997 1.000000 -0.205737 -0.314542
财产净收入 0.826683 -0.205737 1.000000 0.297458
转移净收入 0.401917 -0.314542 0.297458 1.000000
from sklearn.decomposition import PCA

# Standardise the indicator columns; from here on `X` is the scaled ndarray.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

# A float n_components asks PCA to keep as many components as are needed to
# reach that share (95%) of explained variance.
pca = PCA(n_components=0.95)
Y = pca.fit(X).transform(X)  # component scores, one row per sample

tzxl = pca.components_                # principal axes, one row per component
tz = pca.explained_variance_          # variance explained by each component
gxl = pca.explained_variance_ratio_   # share of total variance per component

# Hand-computed first-component scores of the first four samples: the dot
# product of each scaled row with the first principal axis.
Y00, Y01, Y02, Y03 = (sum(X[i, :] * tzxl[0, :]) for i in range(4))

print(Y)
print(gxl)
[[ 4.33588394 1.2871025 1.68836853]
[ 1.53736203 2.12655221 0.56027639]
[-0.46694468 0.18931331 1.38413422]
[-0.10431271 -1.123358 1.17241467]
[-0.55472192 0.67855307 -1.03614749]
[-0.7860694 0.60996214 -0.01608431]
[-1.74256785 1.33152775 -1.02791134]
[-0.43551643 1.06930104 -0.9947622 ]
[ 5.26192283 -1.30032837 -0.65586678]
[ 1.26240024 0.53263985 -1.0411674 ]
[ 1.66736332 1.59956567 -0.2701487 ]
[-0.25754205 -0.62850484 -0.50489853]
[-0.44568285 0.65255314 -0.33371388]
[-0.39402814 -0.24401899 -0.0721038 ]
[-0.79480747 1.26287224 -0.06924108]
[-0.31006603 -0.65108872 -0.54600265]
[-0.37598829 -0.48118327 -1.40014355]
[-0.08226864 -0.94042225 -0.36095876]
[ 0.97778119 -0.73376053 -0.52563007]
[-0.61802252 -0.67589739 -0.56974344]
[-1.17447251 0.42174493 0.46729999]
[ 0.37012714 -1.03216645 -0.91151552]
[-0.1364243 -0.56892611 -0.50361372]
[-0.87126924 -0.98624593 1.31181932]
[-1.62901512 0.41953719 0.98034596]
[-1.58629868 0.32183819 0.58906561]
[-0.18741213 -1.21196511 0.65763622]
[-0.78267223 -1.0472276 0.9187884 ]
[ 0.17356232 -1.22932478 0.06572846]
[-0.53868768 -0.12444272 0.97917061]
[-1.31161213 0.47579784 0.06460487]]
[0.5676807 0.22505502 0.1701918 ]
# Composite score per sample: the component scores weighted by each
# component's explained-variance ratio.  The matrix-vector product covers
# however many components PCA(n_components=0.95) retained, instead of
# hard-coding exactly three terms.
F = Y @ gxl

# Label each score with its region name and rank from highest to lowest.
dq = list(Data['地区'].values)
Rs = pd.Series(F, index=dq).sort_values(ascending=False)
print(Rs)
北京 3.038413
上海 2.582823
天津 1.446676
浙江 1.260543
江苏 0.659315
广东 0.300473
河北 0.013099
山西 -0.112498
福建 -0.162941
青海 -0.166951
宁夏 -0.167162
黑龙江 -0.175883
重庆 -0.177313
山东 -0.178765
陕西 -0.267225
江西 -0.290871
四川 -0.291196
辽宁 -0.311699
湖南 -0.319781
内蒙古 -0.338537
安徽 -0.373579
河南 -0.415474
海南 -0.492279
贵州 -0.493301
甘肃 -0.523621
湖北 -0.560027
广西 -0.599919
新疆 -0.626501
云南 -0.663495
西藏 -0.727826
吉林 -0.864497
dtype: float64
|