import pandas as pd
import numpy as np
import math
import csv
数据预处理
# Load the training CSV (Big5-encoded), drop the three leading metadata
# columns (kept columns are the 24 hourly readings), and replace the
# "NR" marker (presumably "no rainfall" — confirm against the dataset
# description) with 0 so every remaining cell is numeric text.
data = pd.read_csv("/Users/tiger/Desktop/study/机器学习/李宏毅机器学习/李宏毅机器学习资料/数据/hw1/train.csv", encoding="big5")
data = data.iloc[:, 3:]
data[data == "NR"] = 0
# Object ndarray of strings; later code slices it in 18-row groups
# (18 measured features per recorded day).
arr_data = data.to_numpy()
array([['14', '14', '14', ..., '15', '15', '15'],
['1.8', '1.8', '1.8', ..., '1.8', '1.8', '1.8'],
['0.51', '0.41', '0.39', ..., '0.35', '0.36', '0.32'],
...,
['36', '55', '72', ..., '118', '100', '105'],
['1.9', '2.4', '1.9', ..., '1.5', '2', '2'],
['0.7', '0.8', '1.8', ..., '1.6', '1.8', '2']], dtype=object)
"""来的思路就是:把一个月中的每一天的数据放到宏观的一行上去,这样一个月的每个小时就会连接起来,
题目要求是输入连续的9个小时的数据来预测第10个小时的PM2.5的值,这样做会增加我们的数据量,
因为如果将每一天割裂开来看的话,每天只有24个小时,这样得到每天有15组数据,如果将每个月的20天连在一起看的话,会增加我们的数据量"""
month_data = {}
for month in range(12):
sample = np.empty([18, 24 * 20])
for day in range(20):
sample[:, (day * 24):(day + 1) * 24] = arr_data[18 * (month * 20 + day) : 18 * (month * 20 + day + 1), :]
month_data[month] = sample
# Slide a 9-hour window over each month's 480-hour timeline: every window
# start 0..470 gives one sample (480 - 9 = 471 per month). The 18x9 window
# is flattened into one feature row; the label is row 9 (PM2.5) at the
# hour right after the window.
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)
for m in range(12):
    for start in range(471):
        row = m * 471 + start
        window = month_data[m][:, start:start + 9]
        x[row, :] = window.reshape(1, -1)
        y[row, 0] = month_data[m][9, start + 9]
数据标准化和训练集分类
z-score标准化：z-score标准化也叫做标准差标准化。这种方法基于原始数据的均值（mean）和标准差（standard deviation）进行数据的标准化，经过处理的数据符合标准正态分布，即均值为 0，标准差为 1。转换函数为

$$x^* = (x - \mu) / \sigma$$

其中 $\mu$ 为所有样本数据的均值，$\sigma$ 为所有样本数据的标准差。z-score 标准化方法适用于属性的最大值和最小值未知的情况，或存在超出取值范围的离群数据的情况；它要求原始数据的分布近似为高斯分布，否则效果很差。
# Column-wise z-score normalization: x* = (x - mean) / std, applied only
# to columns with non-zero std (constant columns are left untouched,
# exactly as the original per-element loop did). Vectorized: the original
# Python double loop touched every element individually, which is
# needlessly slow for a (5652, 162) matrix.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
nonzero_cols = std_x != 0
x[:, nonzero_cols] = (x[:, nonzero_cols] - mean_x[nonzero_cols]) / std_x[nonzero_cols]
# Chronological 80/20 split into training and validation sets.
split = math.floor(len(x) * 0.8)
x_train_set = x[:split, :]
y_train_set = y[:split, :]
x_validation = x[split:, :]
y_validation = y[split:, :]
训练模型
# Linear regression trained with Adagrad gradient descent; prints RMSE
# every 100 steps and saves the learned weights.
# NOTE(review): training runs on the full x (all 12*471 rows), which
# includes the rows split off as the validation set above — confirm
# this overlap is intended.
dim = 18 * 9 + 1  # 162 features plus one bias term
w = np.ones([dim, 1])
# Prepend a bias column of ones to every sample.
x = np.concatenate((np.ones([12 * 471, 1]), x), axis=1).astype(float)
learning_rate = 100
iter_time = 1000
adagrad = np.zeros([dim, 1])
eps = 0.0000000001
for step in range(iter_time):
    residual = x.dot(w) - y
    rmse = np.sqrt((residual ** 2).sum() / (471 * 12))
    if step % 100 == 0:
        print(str(rmse))
    grad = 2 * x.T.dot(residual)
    adagrad += grad ** 2
    # Adagrad update: per-coordinate step scaled by accumulated squares.
    w -= learning_rate * grad / np.sqrt(adagrad + eps)
np.save('weight.npy', w)
51.65880600054525
20.77437564222039
14.127460058537881
11.303857670788974
9.757354637574426
8.790692587072341
8.132230172649813
7.657473854962996
7.30155272831573
7.027160886807601
载入验证集验证
# Validation: load the trained weights and report RMSE on the held-out set.
w = np.load('weight.npy')
# Derive the sample count instead of hard-coding 1131 (= 5652 - floor(5652*0.8)),
# so this stays correct if the split ratio above ever changes.
n_val = len(x_validation)
x_validation = np.concatenate((np.ones([n_val, 1]), x_validation), axis=1).astype(float)
loss = np.sqrt(np.sum(np.power(np.dot(x_validation, w) - y_validation, 2)) / n_val)
print(loss)
6.396197121704969
预测testdata得到的预测结果
预测结果
id,value
id_0,10.684860338423547
id_1,16.792247467027792
id_2,21.13290914573666
id_3,4.786672166264047
id_4,26.432101029543443
id_5,20.661216398470728
id_6,22.82256193838437
id_7,29.214571337489453
id_8,17.088067086260295
id_9,60.673227496857464
id_10,18.46631184461022
id_11,8.434982922072763
id_12,59.35574122805243
id_13,50.02059833321406
id_14,20.79550695621464
id_15,12.053554778198952
id_16,33.754835175080906
id_17,66.96743889308122
id_18,-0.8295276120536883
id_19,14.777015305410803
id_20,43.6175032338172
id_21,71.71164467829185
id_22,6.4929465889962525
id_23,18.460168780066326
id_24,13.740735501504256
id_25,36.64832550171499
id_26,23.929430220148795
id_27,69.22176125529859
id_28,8.820272432213695
id_29,56.83516827299004
id_30,21.78811976062332
id_31,6.508471140832372
id_32,0.7105070939855977
id_33,19.409173742757318
id_34,28.49631194484278
id_35,37.184830919331205
id_36,42.638699501863655
id_37,27.92062068906882
id_38,34.00184464463722
id_39,32.776282584725735
id_40,0.3147471150095953
id_41,36.76773597455136
id_42,31.38574765130327
id_43,51.619654408540725
id_44,14.573671096022181
id_45,36.094422477026114
id_46,26.325604056692505
id_47,10.83157207605515
id_48,26.27368393160752
id_49,32.87018829008646
id_50,21.115252402995164
id_51,9.11658185777294
id_52,24.740215118968372
id_53,53.86433229948033
id_54,5.078052964091924
id_55,35.845988273365194
id_56,30.31828590622372
id_57,22.76174887515322
id_58,56.79572297160596
id_59,19.41980835383982
id_60,14.800465707889728
id_61,46.4298823306427
id_62,12.296255144242231
id_63,56.587068330899214
id_64,24.682926968653796
id_65,18.08471500735758
id_66,15.4747777481983
id_67,-1.5714839546253438
id_68,42.81756663081315
id_69,26.814103351740815
id_70,22.15278339231544
id_71,38.91154498276733
id_72,52.78645518848199
id_73,6.633607211863456
id_74,19.43262375530285
id_75,5.4295937891811406
id_76,38.297243422541634
id_77,13.376968247715695
id_78,21.79866468557152
id_79,22.461171385436437
id_80,24.93810899497257
id_81,36.90735643294475
id_82,26.72709247850098
id_83,86.20651363799146
id_84,31.09202882682575
id_85,26.048078588981255
id_86,22.753852116326858
id_87,29.286232026745235
id_88,23.013275219331362
id_89,22.419187499516244
id_90,40.09240468814562
id_91,35.64262994643662
id_92,10.906921649402973
id_93,40.481793701688154
id_94,49.12761549292607
id_95,21.827891816977342
id_96,32.06701565115583
id_97,11.405158819907697
id_98,21.924576837234063
id_99,6.876519069507893
id_100,16.37286087332243
id_101,28.84495778453531
id_102,13.452609812202255
id_103,18.43709336763687
id_104,23.50015647759229
id_105,36.57113161400477
id_106,16.11513000361527
id_107,8.823958814008407
id_108,10.817902482139923
id_109,78.48626325585083
id_110,46.057198070963494
id_111,13.533693145063895
id_112,27.317702868312853
id_113,18.078612385296374
id_114,16.859622181123385
id_115,25.83431782988086
id_116,24.001917149550522
id_117,16.8228637727887
id_118,17.761982799364375
id_119,18.62930797542638
id_120,90.61014485728737
id_121,19.68936709209532
id_122,17.03879198736964
id_123,25.597393006578464
id_124,7.342614024228427
id_125,37.06062273612895
id_126,10.119869757737142
id_127,20.41252056793337
id_128,29.820283281964144
id_129,64.72084276369596
id_130,21.19751919800778
id_131,24.76473026977343
id_132,70.8984026705134
id_133,12.454808924935392
id_134,14.99264556523324
id_135,4.596491768102348
id_136,10.81158755792682
id_137,62.348475327738136
id_138,17.743374282278605
id_139,10.00552822850095
id_140,28.316004854752478
id_141,26.929132700545072
id_142,49.88891166877117
id_143,18.607014064620024
id_144,14.837381630867483
id_145,28.754307275109028
id_146,13.341812892765828
id_147,48.93036067836921
id_148,23.37488606829497
id_149,34.564794236357244
id_150,9.787989988615191
id_151,11.238277525790686
id_152,24.277577033530036
id_153,7.841813189508759
id_154,12.028278920333422
id_155,42.12054647560005
id_156,16.512983574701853
id_157,35.474835784714784
id_158,11.019309333374501
id_159,19.224440755506585
id_160,38.55506886082005
id_161,17.826861486790918
id_162,10.427116128034768
id_163,8.783153033770331
id_164,50.68155380516629
id_165,33.729738809658116
id_166,0.5017788609262963
id_167,12.55657750611676
id_168,59.756278536297884
id_169,14.511272260568928
id_170,63.81949522375305
id_171,44.98301839936411
id_172,29.255027629788973
id_173,21.49224158468537
id_174,63.05990857749928
id_175,25.942458829875548
id_176,21.369667808738164
id_177,39.694681688461124
id_178,10.534607913797082
id_179,29.86544565576032
id_180,13.23069080071577
id_181,12.74458536516137
id_182,54.73714086762352
id_183,51.894349020844466
id_184,10.295565024339165
id_185,37.73732395348945
id_186,24.231082971070673
id_187,64.13898558696462
id_188,6.957416048140537
id_189,56.81016913625055
id_190,34.09906021029121
id_191,5.1871116156991235
id_192,32.07943714442466
id_193,0.06307467581223403
id_194,18.552046412640696
id_195,-3.1542322273857053
id_196,34.0673173295785
id_197,11.99973031590292
id_198,18.440571671476015
id_199,63.398411475723606
id_200,28.544839293955526
id_201,44.27732518305021
id_202,64.1996483242329
id_203,12.863245683452739
id_204,13.004498714142244
id_205,11.990468419218224
id_206,10.67900663677997
id_207,0.017342340157624037
id_208,121.05055732258415
id_209,13.5391969137977
id_210,11.843844090936052
id_211,17.551825360772064
id_212,36.056116567552635
id_213,38.310344915605555
id_214,20.846409274510982
id_215,31.349503047162237
id_216,74.68413290066617
id_217,0.4144316509729382
id_218,9.000804405713932
id_219,32.364929554155786
id_220,15.48355693014387
id_221,17.661845462257443
id_222,111.38530244120676
id_223,18.208975045563406
id_224,19.727670087479993
id_225,57.18039983803029
id_226,5.515303704573245
id_227,13.080393765911577
id_228,7.993930582824756
id_229,13.086480003576824
id_230,48.993489699833916
id_231,15.634289875845017
id_232,54.58982652477382
id_233,35.97284400113751
id_234,20.92766779248563
id_235,40.46786755315091
id_236,65.63489071420496
id_237,36.31885165054494
id_238,16.44903482540684
id_239,14.116096888299431
|