数据来源:Kaggle 脑卒中预测数据集(healthcare_dataset_stroke_data.csv)。使用前请先研究清楚数据。代码:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Widen pandas/numpy display limits so the exploratory printouts below are
# not truncated. (The original repeated the np.set_printoptions call twice;
# the duplicate line is removed.)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
np.set_printoptions(threshold=10000)
# Load the stroke dataset and print a quick exploratory summary:
# shape, missing values, dtypes, and value counts of the categorical columns
# plus the target.
csvdata = pd.read_csv('healthcare_dataset_stroke_data.csv')

print(csvdata.shape)
print('\n')
# Report missing values BEFORE imputation — the original filled 'bmi' first,
# which made this report always show zero missing values.
print(csvdata.isnull().sum())
print('\n')
print(csvdata.dtypes)
print('\n')

# Impute missing BMI values with the column mean.
csvdata['bmi'] = csvdata['bmi'].fillna(csvdata['bmi'].mean())

# Class/category distributions (the original spelled these out one by one).
for col in ['gender', 'ever_married', 'work_type', 'Residence_type',
            'smoking_status', 'stroke']:
    print(csvdata[col].value_counts())
    print('\n')
# Correlation heatmap of the numeric columns only; calling DataFrame.corr()
# on a frame that still contains object columns raises a TypeError on
# pandas >= 2.0.
plt.figure(figsize=(12, 10))
ax = sns.heatmap(csvdata.corr(numeric_only=True))

# Histograms of the numeric columns. DataFrame.hist() builds its own subplot
# grid; the original forced every column onto a single Axes via fig.gca(),
# which raises a ValueError in recent pandas.
csvdata.hist(figsize=(20, 15))
plt.show()
# Assemble the modelling frame: numeric/binary columns plus one-hot encoded
# categorical columns.
feature_data = csvdata.loc[:, ["age", "hypertension", "heart_disease",
                               "ever_married", "avg_glucose_level", "bmi",
                               "stroke"]]
text_data = csvdata.loc[:, ["gender", "work_type", "Residence_type",
                            "smoking_status"]]

# Map the Yes/No marriage flag to 1/0.
size_mapping = {'Yes': 1, 'No': 0}
feature_data['ever_married'] = feature_data['ever_married'].map(size_mapping)

# One-hot encode the text columns and join them onto the numeric frame.
# (The original also called pd.get_dummies once more and discarded the result.)
feature_data = feature_data.join(
    pd.get_dummies(text_data,
                   prefix=["gender", "work_type", "Residence_type",
                           "smoking_status"]))

# Random under-sampling: keep every positive (stroke == 1) row and draw an
# equal number of negatives WITHOUT replacement — the original relied on the
# replace=True default, so the same majority row could be sampled repeatedly.
data0_index = feature_data[feature_data['stroke'] == 0].index
data0_len = len(data0_index)
data1_index = feature_data[feature_data['stroke'] == 1].index
data1_len = len(data1_index)
random_index = np.random.choice(data0_index, data1_len, replace=False)
altogether_index = np.concatenate([data1_index, random_index])
# Select by index LABEL (.loc), not position (.iloc): np.random.choice returns
# labels, which only coincide with positions for the default RangeIndex.
altogether_data = feature_data.loc[altogether_index, :]
# Standardize ONLY the feature columns. The original fit the scaler on the
# whole frame including the 'stroke' target, turning the 0/1 labels into
# floats that LogisticRegression rejects ("Unknown label type: continuous").
X_all = altogether_data.loc[:, altogether_data.columns != 'stroke']
y_all = altogether_data.loc[:, 'stroke']

std = StandardScaler()
standard_features = pd.DataFrame(std.fit_transform(X_all),
                                 columns=X_all.columns,
                                 index=X_all.index)

# NOTE: variable names kept from the original script for compatibility —
# 'train_data' holds the features (X) and 'test_data' the target (y).
train_data = standard_features
test_data = y_all

# Fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, test_data, test_size=0.3, random_state=42)
print(y_train)

# Train a logistic-regression classifier on the balanced, scaled data.
# max_iter raised from the default 100 to avoid convergence warnings.
lr_1 = LogisticRegression(max_iter=1000)
# np.ravel collapses a (n, 1) target into the 1-D shape sklearn expects,
# avoiding a DataConversionWarning when y_train is a one-column DataFrame.
lr_1.fit(X_train, np.ravel(y_train))
prediction_lr_1 = lr_1.predict(X_test)

print(confusion_matrix(y_test, prediction_lr_1))
print(classification_report(y_test, prediction_lr_1))
# ROC-AUC should be computed from the positive-class probability, not the
# hard 0/1 predictions, so it reflects the model's ranking quality.
print(roc_auc_score(y_test, lr_1.predict_proba(X_test)[:, 1]))
结果: