import numpy as np
import pandas as pd
# Directory that holds the three CSV inputs read below.
path = r"C:\Users\Tsinghua-yincheng\Desktop\SZday74"

# Load each table; the file name is appended to `path` with a backslash.
pop = pd.read_csv(f"{path}\\state-population.csv")
areas = pd.read_csv(f"{path}\\state-areas.csv")
abbrevs = pd.read_csv(f"{path}\\state-abbrevs.csv")

# Preview the first five rows of every table.
pop.head(n=5)
areas.head(n=5)
abbrevs.head(n=5)

# NOTE(review): `display` is neither defined nor imported in this file;
# presumably the IPython/Jupyter builtin -- confirm before running this
# as a plain script.
display(*[table.shape for table in (pop, areas, abbrevs)])
# Outer-join the population rows with the abbreviation table, matching
# pop["state/region"] against abbrevs["abbreviation"], so rows without a
# counterpart on either side are kept.
merged = pop.merge(
    abbrevs,
    how="outer",
    left_on="state/region",
    right_on="abbreviation",
)
merged

# The right-hand join key is dropped once the merge is done.
merged = merged.drop(columns="abbreviation")
merged

# Which columns contain nulls, and which rows lack a population value?
merged.isna().any()
merged[merged["population"].isna()]

# Region codes whose rows did not receive a "state" name from the join.
merged.loc[merged["state"].isna(), "state/region"]
merged.loc[merged["state"].isna(), "state/region"].unique()

# Fill in the state name by hand for the "PR" and "USA" region codes.
merged.loc[merged["state/region"].eq("PR"), "state"] = "Puerto Rico"
merged.loc[merged["state/region"].eq("USA"), "state"] = "United States"

# Re-check the null status after the manual fixes.
merged.isna().any()
merged
# Attach the area table on the shared "state" column; the left join keeps
# every row of `merged` even when no area is available for it.
final = merged.merge(areas, how="left", on="state")
final
final.shape

# Which columns contain nulls, and which states have no area value?
final.isna().any()
final.loc[final["area (sq. mi)"].isna(), "state"]
final.loc[final["area (sq. mi)"].isna(), "state"].unique()

# Discard every row that still has a null in any column.
final = final.dropna()
final
final.isna().any()
final
# Keep only the rows for year 2010 whose "ages" value is 'total'.
data2010 = final.query("year==2010 & ages=='total'")
data2010
data2010.shape

# Index by state name. The result is rebound rather than modified in place,
# so the filtered frame returned by query() is never mutated.
data2010 = data2010.set_index("state")
data2010

# Population divided by the "area (sq. mi)" column, one value per state.
density = data2010["population"] / data2010["area (sq. mi)"]
density

# Rank from highest to lowest, then look at both ends of the ranking.
density = density.sort_values(ascending=False)
density.head(10)
density.tail(10)