```python
# Make the x-axis always show 10 tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])
plt.show()
```
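As a side note, matplotlib can also cap the tick count for you; a minimal sketch of the same idea (assuming the usual pyplot import as `plt`):

```python
# Alternative: let matplotlib pick at most 10 ticks itself.
from matplotlib.ticker import MaxNLocator

plt.gca().xaxis.set_major_locator(MaxNLocator(10))
plt.show()
```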
Checking correlation coefficients - KOSPI
```python
# Check correlation coefficients - "KOSPI"
print("KOSPI vs KOSDAQ : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["kosdaq"])[0, 1]))
print("KOSPI vs USDKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["usd"])[0, 1]))
print("KOSPI vs CNYKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["cny"])[0, 1]))
print("KOSPI vs JPYKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["jpy"])[0, 1]))
print("KOSPI vs EURKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["eur"])[0, 1]))
print("KOSPI vs INRKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["inr"])[0, 1]))
print("KOSPI vs GBPKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["gbp"])[0, 1]))
print("KOSPI vs BRLKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["brl"])[0, 1]))
print("KOSPI vs CADKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["cad"])[0, 1]))
```
KOSPI vs KOSDAQ : 0.8689701457832764
KOSPI vs USDKRW : -0.7474526625164396
KOSPI vs CNYKRW : -0.8299251745227267
KOSPI vs JPYKRW : -0.6105987487442344
KOSPI vs EURKRW : -0.2986694402889244
KOSPI vs INRKRW : -0.09305307626089275
KOSPI vs GBPKRW : 0.4807979987407145
KOSPI vs BRLKRW : 0.267607616153946
KOSPI vs CADKRW : 0.6836357941585803
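The nine print calls above can also be collapsed into a single pandas call; a minimal sketch, assuming kor_vs_cur is a DataFrame holding all of these series as columns:

```python
# Same correlations in one shot: the "kospi" row of the correlation matrix.
print(kor_vs_cur.corr()["kospi"].drop("kospi"))
```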
```python
# Make the x-axis always show 10 tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])
plt.show()
```
Checking correlation coefficients - "KOSDAQ"
print("KOSDAQ vs KOSDAQ : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["kospi"])[0, 1])) print("KOSDAQ vs USDKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["usd"])[0, 1])) print("KOSDAQ vs CNYKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["cny"])[0, 1])) print("KOSDAQ vs JPYKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["jpy"])[0, 1])) print("KOSDAQ vs EURKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["eur"])[0, 1])) print("KOSDAQ vs INRKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["inr"])[0, 1])) print("KOSDAQ vs GBPKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["gbp"])[0, 1])) print("KOSDAQ vs BRLKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["brl"])[0, 1])) print("KOSDAQ vs CADKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["cad"])[0, 1]))
KOSDAQ vs KOSPI : 0.8689701457832764
KOSDAQ vs USDKRW : -0.4560788590919353
KOSDAQ vs CNYKRW : -0.7171914703642415
KOSDAQ vs JPYKRW : -0.24811709011626565
KOSDAQ vs EURKRW : -0.019148376695966446
KOSDAQ vs INRKRW : -0.3900569760483904
KOSDAQ vs GBPKRW : 0.21872439893983708
KOSDAQ vs BRLKRW : -0.21073197177622852
KOSDAQ vs CADKRW : 0.49393118449595613
```python
# Make the x-axis always show 10 tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])
plt.show()
```
Additional correlation check - "USD"
Compared with KOSPI or KOSDAQ, the correlations here are clearly higher overall:

- USD: 2 correlations with absolute value below 0.4
- KOSPI: 3 correlations with absolute value below 0.4
- KOSDAQ: 5 correlations with absolute value below 0.4

We can see that the dollar's movements affect the entire world.
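These counts can be reproduced straight from the correlation matrix; a minimal sketch, again assuming kor_vs_cur holds every series as a column:

```python
# For each base series, count how many of the other correlations are weak (|r| < 0.4).
corr = kor_vs_cur.corr()
for base in ["usd", "kospi", "kosdaq"]:
    weak = (corr[base].drop(base).abs() < 0.4).sum()
    print("{}: {} correlations with |r| < 0.4".format(base, weak))
```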
print("USD vs KOSPI : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["kospi"])[0, 1])) print("USD vs KOSDAQ : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["kosdaq"])[0, 1])) print("USD vs CNYKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["cny"])[0, 1])) print("USD vs JPYKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["jpy"])[0, 1])) print("USD vs EURKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["eur"])[0, 1])) print("USD vs INRKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["inr"])[0, 1])) print("USD vs GBPKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["gbp"])[0, 1])) print("USD vs BRLKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["brl"])[0, 1])) print("USD vs CADKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["cad"])[0, 1]))
USD vs KOSPI : -0.7474526625164396
USD vs KOSDAQ : -0.45607885909193524
USD vs CNYKRW : 0.8717121292793995
USD vs JPYKRW : 0.7276336988139611
USD vs EURKRW : 0.47570951394792504
USD vs INRKRW : 0.13242540153627583
USD vs GBPKRW : -0.45775893555193975
USD vs BRLKRW : -0.4943384202903821
USD vs CADKRW : -0.3276584506393807
```python
# df.count() does not include NaN values.
# Keep only the features where at least 30% of the values are non-NaN
# (i.e. drop columns with more than 70% missing), and drop Id as well.
df2 = df[[column for column in df if df[column].count() / len(df) >= 0.3]]
del df2['Id']

print("List of dropped columns:", end=" ")
for c in df.columns:
    if c not in df2.columns:
        print(c, end=", ")
print('\n')

df = df2
```
List of dropped columns: Id, Alley, PoolQC, Fence, MiscFeature,
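The sparsity itself is easy to inspect before the drop; a minimal sketch, run on the original df:

```python
# Fraction of missing values per column, highest first.
missing = df.isnull().mean().sort_values(ascending=False)
print(missing[missing > 0.7])  # columns with more than 70% NaN
```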
```python
# The distribution of the housing price
print(df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
sns.distplot(df['SalePrice'], color='g', bins=100, hist_kws={'alpha': 0.4});
```
count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
Name: SalePrice, dtype: float64
From the above graph, it can be deduced that there are outliers above $500,000. These outliers will be deleted to get a more normal distribution of the target variable ('SalePrice') for machine learning <= (not sure what this means)
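A minimal sketch of that removal, assuming the $500,000 cutoff is simply read off the plot above:

```python
# Drop the handful of houses priced above $500,000.
df = df[df['SalePrice'] <= 500000]
```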
“Features such as 1stFlrSF, TotalBsmtSF, LotFrontage, GrLivArea… seem to share a similar distribution to the one we have with SalePrice. Let’s see if we can find new clues later.”
At this point, I suddenly came to wonder: what are we trying to find through all of this? And why does SalePrice seem to be at the center of the whole thing?
```python
df_num_corr = df_num.corr()['SalePrice'][:-1]  # -1 because the last row is SalePrice itself
golden_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("The following are the top {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list), golden_features_list))
```
The following are the top 10 strongly correlated values with SalePrice:
OverallQual 0.790982
GrLivArea 0.708624
GarageCars 0.640409
GarageArea 0.623431
TotalBsmtSF 0.613581
1stFlrSF 0.605852
FullBath 0.560664
TotRmsAbvGrd 0.533723
YearBuilt 0.522897
YearRemodAdd 0.507101
Name: SalePrice, dtype: float64
From the table above, we now know which features have the strongest relationship with SalePrice. But this data is still incomplete, as the outliers still exist in the dataset.
To get rid of the outliers, the following measures can be taken:
1. Plot the numerical features and see which ones have very few or explainable outliers
2. Remove the outliers from these features and see which ones keep a good correlation without their outliers (see the sketch after this list)
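A minimal sketch of both steps for a single feature; GrLivArea is just one golden feature picked for illustration, and the 4,000 sq ft cutoff is a hypothetical threshold one might read off the scatter plot:

```python
# Step 1: plot the feature against SalePrice and eyeball the outliers.
plt.figure(figsize=(6, 4))
plt.scatter(df['GrLivArea'], df['SalePrice'], alpha=0.3)
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()

# Step 2: drop the few extreme points and re-check the correlation.
mask = df['GrLivArea'] < 4000  # hypothetical cutoff read off the plot
print(np.corrcoef(df.loc[mask, 'GrLivArea'], df.loc[mask, 'SalePrice'])[0, 1])
```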
NOTE 1
Correlation by itself does not always explain the relationship between variables
Plotting data could lead to new insights
THEREFORE ALWAYS VISUALIZE THE DATA
NOTE 2
A curvilinear relationship cannot be deduced from the correlation value alone.
SO ALWAYS VISUALIZE THE DATA IN NUMEROUS WAYS TO GET MORE INSIGHTS
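A tiny illustration of NOTE 2: a perfectly deterministic but curved relationship can still have a Pearson correlation of essentially zero.

```python
import numpy as np

x = np.linspace(-1, 1, 101)
y = x ** 2                      # deterministic curvilinear relationship
print(np.corrcoef(x, y)[0, 1])  # ~0.0: the correlation completely misses it
```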
For example, a curvilinear relationship cannot be guessed just by looking at the correlation value, so let's take the features we excluded from our correlation table and plot them to see if they show some kind of pattern.
Not sure yet what features were excluded from the correlation table.
```python
# Plot every numerical feature against SalePrice, five features per figure.
for i in range(0, len(df_num.columns), 5):
    sns.pairplot(data=df_num, x_vars=df_num.columns[i:i+5], y_vars=['SalePrice'])
```
Deduction:
- Many features seem to have a linear relationship with SalePrice
- A lot of data points are located at x = 0 (a possible indication that those features are absent from the house)
More Data Cleaning
Remove the 0 values and repeat the process of finding correlated values
```python
import operator

individual_features_df = []
for i in range(0, len(df_num.columns) - 1):  # -1 because the last column is SalePrice
    tmpDf = df_num[[df_num.columns[i], 'SalePrice']]
    tmpDf = tmpDf[tmpDf[df_num.columns[i]] != 0]
    individual_features_df.append(tmpDf)

all_correlations = {feature.columns[0]: feature.corr()['SalePrice'][0]
                    for feature in individual_features_df}
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))

for (key, value) in all_correlations:
    print("{:>15}: {:>15}".format(key, value))
```
The most strongly correlated features are then collected into golden_features_list:
```python
golden_features_list = [key for key, value in all_correlations if abs(value) >= 0.5]
print("Following are the top {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list), golden_features_list))
```
Following are the top 11 strongly correlated values with SalePrice:
['YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GarageCars', '2ndFlrSF', 'GrLivArea', 'OverallQual']
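The regplot loop below assumes a grid of axes already exists; a plausible setup (the names features_to_analyse and fig, and the exact grid shape, are assumptions, since the original cell is not shown):

```python
# Hypothetical setup: one subplot per golden feature, plus SalePrice.
features_to_analyse = golden_features_list + ['SalePrice']
fig, ax = plt.subplots(round(len(features_to_analyse) / 3), 3, figsize=(18, 12))
```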
```python
for i, ax in enumerate(fig.axes):
    if i < len(features_to_analyse) - 1:
        sns.regplot(x=features_to_analyse[i], y='SalePrice', data=df[features_to_analyse], ax=ax)
```
We can see that features such as TotalBsmtSF, 1stFlrSF, and GrLivArea have a big spread, but I cannot tell what insights this information gives us.
C -> Q (Categorical to Quantitative Relationship)
```python
# quantitative_features_list[:-1] as the last entry is SalePrice and we want to keep it
categorical_features = [a for a in quantitative_features_list[:-1] + df.columns.tolist()
                        if (a not in quantitative_features_list[:-1]) or (a not in df.columns.tolist())]
df_categ = df[categorical_features]
df_categ.head()
```
| | MSSubClass | MSZoning | Street | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageYrBlt | GarageFinish | GarageQual | GarageCond | PavedDrive | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | 2003.0 | RFn | TA | TA | Y | 2 | 2008 | WD | Normal | 208500 |
| 1 | 20 | RL | Pave | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | 1976.0 | RFn | TA | TA | Y | 5 | 2007 | WD | Normal | 181500 |
| 2 | 60 | RL | Pave | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | 2001.0 | RFn | TA | TA | Y | 9 | 2008 | WD | Normal | 223500 |
| 3 | 70 | RL | Pave | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | ... | 1998.0 | Unf | TA | TA | Y | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 60 | RL | Pave | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | ... | 2000.0 | RFn | TA | TA | Y | 12 | 2008 | WD | Normal | 250000 |

5 rows × 49 columns
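A side note on the list comprehension above: since every quantitative feature is itself a column of df (an assumption, but it holds here), the symmetric-difference construction reduces to a plain set difference:

```python
# Equivalent, arguably clearer: everything that is not a quantitative feature.
categorical_features = [c for c in df.columns
                        if c not in quantitative_features_list[:-1]]
```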
Non-numerical features
```python
df_not_num = df_categ.select_dtypes(include=['O'])  # 'O' = Object dtype
print('There are {} non numerical features including:\n{}'.format(len(df_not_num.columns), df_not_num.columns.tolist()))
```
“Looking at these features we can see that a lot of them are of the type Object(O). In our data transformation notebook we could use Pandas categorical functions (equivalent to R’s factor) to shape our data in a way that would be interpretable for our machine learning algorithm. ExterQual for instance could be transformed to an ordered categorical object.”
I definitely need more study on this, especially on the transformation of the data.
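A minimal sketch of that ordered-categorical idea; the Po < Fa < TA < Gd < Ex quality scale comes from the dataset's documentation, so treat it as an assumption here:

```python
import pandas as pd

# Encode ExterQual as an ordered categorical, then as integer codes.
df['ExterQual'] = pd.Categorical(df['ExterQual'],
                                 categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                                 ordered=True)
print(df['ExterQual'].cat.codes.head())  # 0 = Po ... 4 = Ex
```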
```python
# Assumes fig (a grid with one axis per non-numerical column) was created above.
for i, ax in enumerate(fig.axes):
    if i < len(df_not_num.columns):
        ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=45)
        sns.countplot(x=df_not_num.columns[i], alpha=0.7, data=df_not_num, ax=ax)

fig.tight_layout()
```
Deduction
Some categories are predominant for some features, e.g. Utilities, Heating, etc.
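That dominance also shows up numerically; a quick check on one of the features mentioned:

```python
# Almost every house shares the same Utilities value.
print(df_not_num['Utilities'].value_counts())
```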