
Correlation between the exchange rates of the top 10 GDP countries and the KOSPI/KOSDAQ of South Korea (11th in GDP)


Data Collection

  • Crawl the exchange rates of the top 10 GDP countries and Korea's KOSPI/KOSDAQ from the Naver Finance pages
  • 100 days of data

KOSPI / KOSDAQ

import requests
import pandas as pd

def get_stock_data(code, page_size=100, page=1):
    url = "https://m.stock.naver.com/api/json/sise/dailySiseIndexListJson.nhn?code={}&pageSize={}&page={}".format(code, page_size, page)
    response = requests.get(url)
    datas = response.json()['result']['siseList']
    return pd.DataFrame(datas)
kosdaq_df = get_stock_data("KOSDAQ")
kosdaq_df.tail(2)

cd dt ncv cv cr ov hv lv
98 KOSDAQ 20200207 672.63 -0.06 -0.01 673.11 674.03 667.93
99 KOSDAQ 20200206 672.69 11.37 1.72 665.94 672.69 665.88
kospi_df = get_stock_data("KOSPI")
kospi_df.tail(2)

cd dt ncv cv cr ov hv lv
98 KOSPI 20200207 2211.95 -15.99 -0.72 2221.49 2223.92 2198.14
99 KOSPI 20200206 2227.94 62.31 2.88 2184.77 2229.99 2182.25
df_kor = pd.merge(kospi_df, kosdaq_df, on="dt")
df_kor = df_kor.rename(columns={"ncv_x": "kospi", "ncv_y": "kosdaq"})
df_kor = df_kor[["dt", "kospi", "kosdaq"]]
df_kor = df_kor.set_index('dt')
df_kor.tail(2)

kospi kosdaq
dt
20200207 2211.95 672.63
20200206 2227.94 672.69

Exchange Rates

def get_currency_data(code, page_size=100, page=1):
    url = "https://m.stock.naver.com/api/json/marketindex/marketIndexDay.nhn?marketIndexCd={}&pageSize={}&page={}".format(code, page_size, page)
    # session cookie copied from the browser; this endpoint expects it in the headers
    headers = {
        "cookie": "NNB=ITGWQQXTH6IV4; NRTK=ag#all_gr#1_ma#-2_si#0_en#0_sp#0; ASID=d3db6cfa0000017292952d4c00006e70; nx_ssl=2; notice_new_account_171101=1; MM_NEW=1; NFS=2; MM_NOW_COACH=1; page_uid=UYoLesprvh8ssZHPceGssssssLl-298586; nid_inf=-1468512193; NID_AUT=RzrySnbsg4az0Z1kbSBCoLIC+oPS+1qfJCrnfq+S6eQVpRQOk2a/c8WWUJySQubY; NID_JKL=mueUVlznsILAKlGNEEOZL2gSa7GClv2m/wy+sWHeJP8=; NID_SES=AAABf7yRp/MIC8NRjyN8FNZm5ZIzE6EmXPjzoT8h5j5Tk6qVf38M4x3lGhbka33XB1/8dRoxW10UNBGHVSCOLiorv+O/Huw4fvZ6wkg8sXn6GD2QoxhPliJuh2e6WWXKTRRQq+pQRa/T1glGGsCR4PUMM7OlQ/p7ORm9ByQSdvLKvXbBBk0b6l/YSPZugdtVi44zKevugHN6UhOa/YfPNVa/ppkZ7MUh9Z2PK9F7SL1u7DmKpHAPREqDOgadvt7BPCWGhzmpZLycXPnMxsP4c3+Vf9u4Uw87TGtst/0xbetORWritqcMQV+9UW+zmb+pEnxBhLLQUEYp8jBtu8NRN/wCRj++ryNMaVdk5/uOdRcImcjbRlYuZ6xQsF+l2f6L470CNqlM9SJKFEZ9N/0X2yk+al75Jmqgpxw6WjmcX+G2lqoyobjBja0LzXW/Eks/EqatKybMe2DrqFjgpW1EKwjb180PaqehezYvyRRe2QSlMmzxe1FGhZNUfok90ZXGhp5yKQ==; XSRF-TOKEN=ca183283-9562-4e61-817f-db7653bb0636; JSESSIONID=767616970B529D565974D21FBE89A761; BMR="
    }
    response = requests.get(url, headers=headers)
    datas = response.json()['result']['marketIndexDay']
    df = pd.DataFrame(datas)
    df = df[["dt", "nv"]]
    df.columns = ["dt", "{}_nv".format(code)]
    df = df.set_index('dt')
    return df
# United States
usd_df = get_currency_data("FX_USDKRW")
# usd_df.tail(2)

# China
cny_df = get_currency_data("FX_CNYKRW")
# cny_df.tail(2)

# Japan
jpy_df = get_currency_data("FX_JPYKRW")
# jpy_df.tail(2)

# Euro (Germany, France, Italy)
eur_df = get_currency_data("FX_EURKRW")
# eur_df.tail(2)

# India
inr_df = get_currency_data("FX_INRKRW")
# inr_df.tail(2)

# United Kingdom
gbp_df = get_currency_data("FX_GBPKRW")
# gbp_df.tail(2)

# Brazil
brl_df = get_currency_data("FX_BRLKRW")
# brl_df.tail(2)

# Canada
cad_df = get_currency_data("FX_CADKRW")
# cad_df.tail(2)

Data Cleanup

cur = pd.concat([usd_df, cny_df, jpy_df, eur_df, inr_df, gbp_df, brl_df, cad_df], axis=1, ignore_index=True)
cur.columns = ['usd', 'cny', 'jpy', 'eur', 'inr', 'gbp', 'brl', 'cad']
cur

usd cny jpy eur inr gbp brl cad
dt
20200630 1202.3 170.01 1115.88 1348.56 15.92 1476.48 222.50 878.59
20200629 1200.5 169.78 1120.03 1354.34 15.90 1479.98 218.84 879.00
20200626 1200.0 169.41 1123.02 1347.48 15.87 1489.86 223.85 879.96
20200625 1203.5 169.94 1122.20 1349.42 15.92 1497.52 225.03 883.69
20200624 1202.5 170.27 1128.63 1359.55 15.90 1505.89 233.38 886.41
... ... ... ... ... ... ... ... ...
20200212 1180.0 169.27 1072.97 1288.21 16.53 1531.99 272.40 889.19
20200211 1184.5 169.82 1077.95 1293.30 16.62 1530.97 273.90 891.20
20200210 1187.0 170.04 1081.40 1299.88 16.64 1534.55 274.73 893.15
20200207 1193.5 170.50 1087.17 1307.06 16.71 1543.08 278.70 896.66
20200206 1184.0 169.85 1078.28 1302.87 16.63 1535.59 279.56 891.50

100 rows × 8 columns

kor_vs_cur = pd.concat([df_kor, cur], axis=1, ignore_index=True)
kor_vs_cur.columns = ['kospi', 'kosdaq', 'usd', 'cny', 'jpy', 'eur', 'inr', 'gbp', 'brl', 'cad']
kor_vs_cur.tail()

kospi kosdaq usd cny jpy eur inr gbp brl cad
dt
20200212 2238.38 686.59 1180.0 169.27 1072.97 1288.21 16.53 1531.99 272.40 889.19
20200211 2223.12 682.34 1184.5 169.82 1077.95 1293.30 16.62 1530.97 273.90 891.20
20200210 2201.07 676.07 1187.0 170.04 1081.40 1299.88 16.64 1534.55 274.73 893.15
20200207 2211.95 672.63 1193.5 170.50 1087.17 1307.06 16.71 1543.08 278.70 896.66
20200206 2227.94 672.69 1184.0 169.85 1078.28 1302.87 16.63 1535.59 279.56 891.50

Plotting the Series to Check the Correlations

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale

plt.figure(figsize=(40, 10))
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kospi"]), "r*:", label="kospi")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kosdaq"]), "bo--", label="kosdaq")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["usd"]), "g^-", label="usd")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cny"]), "y^-", label="cny")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["jpy"]), "k--", label="jpy")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["eur"]), "m*:", label="eur")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["inr"]), "c^-", label="inr")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["gbp"]), "go--", label="gbp")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["brl"]), "y*:", label="brl")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cad"]), "g*:", label="cad")

# legend for the labels above
plt.legend(loc=0)

# always show exactly 10 x-axis tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])

plt.show()

[Plot: min-max scaled KOSPI, KOSDAQ, and the eight exchange rates]


Checking the Correlation Coefficients - KOSPI

# correlation coefficients against KOSPI
print("KOSPI vs KOSDAQ : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["kosdaq"])[0, 1]))
print("KOSPI vs USDKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["usd"])[0, 1]))
print("KOSPI vs CNYKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["cny"])[0, 1]))
print("KOSPI vs JPYKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["jpy"])[0, 1]))
print("KOSPI vs EURKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["eur"])[0, 1]))
print("KOSPI vs INRKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["inr"])[0, 1]))
print("KOSPI vs GBPKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["gbp"])[0, 1]))
print("KOSPI vs BRLKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["brl"])[0, 1]))
print("KOSPI vs CADKRW : {}".format(np.corrcoef(kor_vs_cur["kospi"], kor_vs_cur["cad"])[0, 1]))
KOSPI vs KOSDAQ : 0.8689701457832764
KOSPI vs USDKRW : -0.7474526625164396
KOSPI vs CNYKRW : -0.8299251745227267
KOSPI vs JPYKRW : -0.6105987487442344
KOSPI vs EURKRW : -0.2986694402889244
KOSPI vs INRKRW : -0.09305307626089275
KOSPI vs GBPKRW : 0.4807979987407145
KOSPI vs BRLKRW : 0.267607616153946
KOSPI vs CADKRW : 0.6836357941585803

KOSPI - Negative Correlation

plt.figure(figsize=(40, 10))
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kospi"]), "r*:", label="kospi")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kosdaq"]), "bo--", label="kosdaq")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["usd"]), "g^-", label="usd")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cny"]), "y^-", label="cny")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["jpy"]), "k--", label="jpy")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["eur"]), "m*:", label="eur")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["inr"]), "c^-", label="inr")

# legend for the labels above
plt.legend(loc=0)

# always show exactly 10 x-axis tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])

plt.show()

[Plot: KOSPI and KOSDAQ against the negatively correlated currencies]


Checking the Correlation Coefficients - KOSDAQ

print("KOSDAQ vs KOSPI : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["kospi"])[0, 1]))
print("KOSDAQ vs USDKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["usd"])[0, 1]))
print("KOSDAQ vs CNYKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["cny"])[0, 1]))
print("KOSDAQ vs JPYKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["jpy"])[0, 1]))
print("KOSDAQ vs EURKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["eur"])[0, 1]))
print("KOSDAQ vs INRKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["inr"])[0, 1]))
print("KOSDAQ vs GBPKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["gbp"])[0, 1]))
print("KOSDAQ vs BRLKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["brl"])[0, 1]))
print("KOSDAQ vs CADKRW : {}".format(np.corrcoef(kor_vs_cur["kosdaq"], kor_vs_cur["cad"])[0, 1]))
KOSDAQ vs KOSPI : 0.8689701457832764
KOSDAQ vs USDKRW : -0.4560788590919353
KOSDAQ vs CNYKRW : -0.7171914703642415
KOSDAQ vs JPYKRW : -0.24811709011626565
KOSDAQ vs EURKRW : -0.019148376695966446
KOSDAQ vs INRKRW : -0.3900569760483904
KOSDAQ vs GBPKRW : 0.21872439893983708
KOSDAQ vs BRLKRW : -0.21073197177622852
KOSDAQ vs CADKRW : 0.49393118449595613

KOSDAQ - Negative Correlation

plt.figure(figsize=(40, 10))
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kospi"]), "r*:", label="kospi")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kosdaq"]), "bo--", label="kosdaq")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["usd"]), "g^-", label="usd")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cny"]), "y^-", label="cny")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["jpy"]), "k--", label="jpy")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["eur"]), "m*:", label="eur")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["inr"]), "c^-", label="inr")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["brl"]), "y*:", label="brl")

# legend for the labels above
plt.legend(loc=0)

# always show exactly 10 x-axis tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])

plt.show()

[Plot: KOSDAQ against the negatively correlated currencies]



The correlation check shows that the following pairs are strongly correlated:

  • KOSPI vs USDKRW : -0.7474526625164396
  • KOSPI vs CNYKRW : -0.8299251745227267
  • KOSPI vs JPYKRW : -0.6105987487442344
  • KOSDAQ vs CNYKRW : -0.7171914703642415

Since these correlations are negative, the dollar and the yuan are relatively cheap when KOSDAQ rises, so that might be a good time to buy them.
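
As a side note, the whole table of coefficients can be computed in one shot with pandas instead of calling np.corrcoef pair by pair; a minimal sketch, assuming the kor_vs_cur DataFrame built above:

# full Pearson correlation matrix of all ten series
corr_matrix = kor_vs_cur.corr()

# every series against KOSPI and KOSDAQ at once
print(corr_matrix[['kospi', 'kosdaq']])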

# KOSPI & KOSDAQ - Negative Correlation
plt.figure(figsize=(40, 10))
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kospi"]), "r*:", label="kospi")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kosdaq"]), "bo--", label="kosdaq")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["usd"]), "g^-", label="usd")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cny"]), "y^-", label="cny")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["jpy"]), "k--", label="jpy")

# legend for the labels above
plt.legend(loc=0)

# always show exactly 10 x-axis tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])

plt.show()

[Plot: KOSPI, KOSDAQ, USD, CNY, JPY]


Additional Correlation Check - USD

  • Compared with KOSPI or KOSDAQ, the coefficients against USD are clearly higher overall
  • USD: 2 coefficients with absolute value below 0.4
  • KOSPI: 3 coefficients with absolute value below 0.4
  • KOSDAQ: 5 coefficients with absolute value below 0.4

This shows that the dollar's movements affect the entire world.
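
The counts above can be reproduced with a short loop; a sketch, again assuming the kor_vs_cur DataFrame from earlier:

# for each anchor series, count how many of the other nine correlations are weak (|r| < 0.4)
corr_matrix = kor_vs_cur.corr()
for anchor in ['usd', 'kospi', 'kosdaq']:
    others = corr_matrix[anchor].drop(anchor)
    print(anchor, (others.abs() < 0.4).sum())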

print("USD vs KOSPI : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["kospi"])[0, 1]))
print("USD vs KOSDAQ : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["kosdaq"])[0, 1]))
print("USD vs CNYKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["cny"])[0, 1]))
print("USD vs JPYKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["jpy"])[0, 1]))
print("USD vs EURKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["eur"])[0, 1]))
print("USD vs INRKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["inr"])[0, 1]))
print("USD vs GBPKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["gbp"])[0, 1]))
print("USD vs BRLKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["brl"])[0, 1]))
print("USD vs CADKRW : {}".format(np.corrcoef(kor_vs_cur["usd"], kor_vs_cur["cad"])[0, 1]))
USD vs KOSPI : -0.7474526625164396
USD vs KOSDAQ : -0.45607885909193524
USD vs CNYKRW : 0.8717121292793995
USD vs JPYKRW : 0.7276336988139611
USD vs EURKRW : 0.47570951394792504
USD vs INRKRW : 0.13242540153627583
USD vs GBPKRW : -0.45775893555193975
USD vs BRLKRW : -0.4943384202903821
USD vs CADKRW : -0.3276584506393807
plt.figure(figsize=(40, 10))
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kospi"]), "r*:", label="kospi")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["kosdaq"]), "bo--", label="kosdaq")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["usd"]), "g^-", label="usd")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cny"]), "y^-", label="cny")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["jpy"]), "k--", label="jpy")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["eur"]), "m*:", label="eur")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["inr"]), "c^-", label="inr")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["gbp"]), "go--", label="gbp")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["brl"]), "y*:", label="brl")
plt.plot(kor_vs_cur.index, minmax_scale(kor_vs_cur["cad"]), "g*:", label="cad")

# legend for the labels above
plt.legend(loc=0)

# always show exactly 10 x-axis tick labels
step = len(kor_vs_cur) // 10
plt.xticks(kor_vs_cur.index[::step])

plt.show()

[Plot: all series, min-max scaled]


Future Work

  • Correlation between the NASDAQ, other major US indices, and the Forbes 100 US companies
  • Correlation between the PMI index and the world's top 100 companies

Starting to Study Apache Spark and Hadoop



Apache Spark

  • Apache Spark is written in Scala, so the Scala API always ships first
  • Apache Spark is a software platform for processing big data
    • What is big data?
      • An amount of data that cannot be processed on a single machine
  • Apache Spark uses multiple servers by default
  • Apache Spark lets you work with many servers as if they were a single machine
  • The Spark engine automatically distributes the data across the machines by hashing (a minimal PySpark sketch follows below)
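
To make the "many servers as one machine" idea concrete, here is a minimal PySpark word count (my own sketch, not from the original notes); "local[*]" only simulates a cluster on the local cores, but the same code would run unchanged on a real cluster:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("sketch").getOrCreate()

# the engine decides how these records get partitioned across the workers
rdd = spark.sparkContext.parallelize(["a b a", "b c"])
counts = (rdd.flatMap(lambda line: line.split())
             .map(lambda word: (word, 1))
             .reduceByKey(lambda x, y: x + y))
print(counts.collect())
spark.stop()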


Hadoop

  • HDFS (Hadoop Distributed File System) - an open-source file system based on the Google File System paper
  • MR
    • MapReduce - when Hadoop was young, only Map and Reduce operations were available
    • Today, Spark computations can also run inside Hadoop
  • YARN (Yet Another Resource Negotiator): the negotiator for everything fighting over memory and CPU
    • 'I need this much memory and CPU,' says the app -> 'Go use servers x and y,' says YARN
    • If there is no more memory or CPU to hand out, the job is left pending
  • Visualizes how data jobs were executed internally
  • Prevents data loss by automatically keeping 3 copies of the data by default



It looks intimidating, but let's get along :)

After Crawling Instagram, I Lost the Search Feature


To start with the conclusion:

search currently does not work on my Instagram account.

[screenshot]


I am currently working on a regression analysis project.


While gathering all sorts of data for the project,

I needed Instagram data, so I ended up building a crawler.

[gif]


Instagram lets you search up to two times without logging in.

But I had 500 items to scrape, so I added a feature that logs in automatically.


Along the way, I discovered that once you search more than a certain number of times, Instagram throws an error page like the one above.


It collected the first 20-30 searches without a problem before the error page appeared,

and when I played it safe and batched the searches in groups of 10, no problems came up (sketched below).
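
A minimal sketch of that batching idea, with hypothetical names since the original crawler code isn't shown:

import time

def crawl_in_batches(queries, crawl_one, batch_size=10, pause_sec=60):
    # run the searches 10 at a time, pausing between batches to stay under the limit
    results = []
    for i in range(0, len(queries), batch_size):
        for q in queries[i:i + batch_size]:
            results.append(crawl_one(q))
        time.sleep(pause_sec)
    return results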


Things seemed to be going more smoothly than I had expected.

[screenshot]


I got to savor the proud feeling for a moment, but...

[screenshot]


Instagram was, after all, Instagram:

a big company, and not a pushover.


[screenshot]

The crawl sailed past 180 items without a hitch, and then the first error occurred.

The crawler, which had been fetching the specific fields I pointed it at just fine, started collecting search results with no content. The location of the data I wanted had moved inside the Instagram page.

It searched 200 more items in this state.


Since the code ran on its own, I wasn't paying attention; when I finally checked, Instagram had recognized that the code running under my account was automated, and my account could no longer open the search screen.

I couldn't even open my own profile page.


Looking it up later, crawling within Instagram is prohibited by its policy.

(Had I known, I probably would have built a crawler that didn't use my account... or maybe I wouldn't have built one at all...)

[screenshot: the Instagram policy]



Not knowing the policy existed, I crawled, and in the end my Instagram lost its search feature...


So, in conclusion...

Dear Instagram, I get it, you're the best,

[image]

and I really will never, absolutely, probably, do it again,

[image]

so stop social-distancing from me

and please free my account soon.

[image]

https://www.kaggle.com/ekami66/detailed-exploratory-data-analysis-with-python/data#Exploratory-data-analysis-(EDA)

Note: I do not own the code below. It is identical to the code at the URL above and is used here only to practice EDA.


Preparations

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('bmh')

df = pd.read_csv('train.csv')
df.head()

Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns


df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
# df.count() does not include NaN values
# drop Id and any feature where fewer than 30% of the values are non-null
df2 = df[[column for column in df if df[column].count() / len(df) >= 0.3]]
del df2['Id']
print("List of dropped columns:", end=" ")
for c in df.columns:
    if c not in df2.columns:
        print(c, end=", ")

print('\n')
df = df2
List of dropped columns: Id, Alley, PoolQC, Fence, MiscFeature, 

# the distribution of the housing price
print(df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
sns.distplot(df['SalePrice'], color='g', bins=100, hist_kws={'alpha': 0.4});
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

[Figure: SalePrice distribution]

From the graph above, it can be deduced that there are outliers above $500,000. These outliers will be deleted to get a more normal distribution of the dependent variable ('SalePrice') for machine learning <= (not sure what this means)
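
A minimal sketch of what that deletion could look like; the $500,000 figure comes from the paragraph above, and the exact cutoff is a judgment call:

# keep only the houses at or below the $500,000 threshold
df_no_outliers = df[df['SalePrice'] <= 500000]
print(len(df), '->', len(df_no_outliers), 'rows')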


Numerical Data Distribution

list(set(df.dtypes.tolist()))
[dtype('O'), dtype('float64'), dtype('int64')]
df_num = df.select_dtypes(include=['float64', 'int64'])
df_num.head()

MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
0 60 65.0 8450 7 5 2003 2003 196.0 706 0 ... 0 61 0 0 0 0 0 2 2008 208500
1 20 80.0 9600 6 8 1976 1976 0.0 978 0 ... 298 0 0 0 0 0 0 5 2007 181500
2 60 68.0 11250 7 5 2001 2002 162.0 486 0 ... 0 42 0 0 0 0 0 9 2008 223500
3 70 60.0 9550 7 5 1915 1970 0.0 216 0 ... 0 35 272 0 0 0 0 2 2006 140000
4 60 84.0 14260 8 5 2000 2000 350.0 655 0 ... 192 84 0 0 0 0 0 12 2008 250000

5 rows × 37 columns


Plot them all:

df_num.hist(figsize=(15, 20), bins=50, xlabelsize=8, ylabelsize=8);

[Figure: histograms of all numerical features]


“Features such as 1stFlrSF, TotalBsmtSF, LotFrontage, GrLiveArea… seems to share a similar distribution to the one we have with SalePrice. Lets see if we can find new clues later.”

At this point I suddenly wondered: what are we trying to find through this whole process, and why does SalePrice seem to sit at the center of everything?

df_num_corr = df_num.corr()['SalePrice'][:-1]  # [:-1] drops the last row, which is SalePrice itself
golden_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("The following are the top {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list), golden_features_list))
The following are the top 10 strongly correlated values with SalePrice:
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
Name: SalePrice, dtype: float64

From the table above, we now know which features have the strongest relationship with SalePrice. But this picture is still incomplete, since the outliers are still in the dataset.


To get rid of the outliers the following measures can be taken:

1. Plot the numerical features and see which ones have very few or explainable outliers

2. Remove the outliers from these features and see which ones keep a good correlation without their outliers

NOTE 1

  • Correlation by itself does not always explain the relationship between data
  • Plotting data could lead to new insights

THEREFORE ALWAYS VISUALIZE THE DATA


NOTE 2

A correlation value by itself cannot reveal a curvilinear relationship.

SO ALWAYS VISUALIZE THE DATA IN NUMEROUS WAYS TO GET MORE INSIGHTS
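
A tiny illustration of the point (mine, not the kernel's): y below is fully determined by x, yet the Pearson coefficient is essentially zero because the relationship is not linear:

import numpy as np

x = np.linspace(-1, 1, 201)
y = x ** 2  # perfect curvilinear dependence on x
print(np.corrcoef(x, y)[0, 1])  # ~0: linear correlation misses it entirely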

For example, relationships such as curvilinear relationship cannot be guessed just by looking at the correlation value so lets take the features we excluded from our correlation table and plot them to see if they show some kind of pattern.

Not sure yet what features were excluded from the correlation table.


for i in range(0, len(df_num.columns), 5):
    sns.pairplot(data=df_num,
                 x_vars=df_num.columns[i:i+5],
                 y_vars=['SalePrice'])

[Figures: pairplots of each numerical feature against SalePrice]


Deduction:

  • Many features seem to have a linear relationship with SalePrice
  • A lot of data points sit at x = 0, a possible indication that the house lacks the feature in question

More Data Cleaning

Remove the 0 values and repeat the search for correlated features

import operator

individual_features_df = []
for i in range(0, len(df_num.columns) - 1):
    # -1 because the last column is SalePrice
    tmpDf = df_num[[df_num.columns[i], 'SalePrice']]
    tmpDf = tmpDf[tmpDf[df_num.columns[i]] != 0]
    individual_features_df.append(tmpDf)

all_correlations = {feature.columns[0]: feature.corr()['SalePrice'][0] for feature in individual_features_df}
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))

for (key, value) in all_correlations:
    print("{:>15}: {:>15}".format(key, value))
 KitchenAbvGr: -0.13920069217785566
     HalfBath: -0.08439171127179887
   MSSubClass: -0.08428413512659523
  OverallCond: -0.0778558940486776
       YrSold: -0.028922585168730426
 BsmtHalfBath: -0.028834567185481712
     PoolArea: -0.014091521506356928
 BsmtFullBath: 0.011439163340408634
       MoSold: 0.04643224522381936
    3SsnPorch: 0.06393243256889079
  OpenPorchSF: 0.08645298857147708
      MiscVal: 0.08896338917298924
   Fireplaces: 0.1216605842136395
    BsmtUnfSF: 0.16926100049514192
 BedroomAbvGr: 0.18093669310849045
   WoodDeckSF: 0.19370601237520677
   BsmtFinSF2: 0.19895609430836586
EnclosedPorch: 0.2412788363011751
  ScreenPorch: 0.25543007954878405
      LotArea: 0.2638433538714063
 LowQualFinSF: 0.3000750165550133
  LotFrontage: 0.35179909657067854
   MasVnrArea: 0.4340902197568926
   BsmtFinSF1: 0.4716904265235731
  GarageYrBlt: 0.48636167748786213
 YearRemodAdd: 0.5071009671113867
    YearBuilt: 0.5228973328794967
 TotRmsAbvGrd: 0.5337231555820238
     FullBath: 0.5745626737760816
     1stFlrSF: 0.6058521846919166
   GarageArea: 0.6084052829168343
  TotalBsmtSF: 0.6096808188074366
   GarageCars: 0.6370954062078953
     2ndFlrSF: 0.6733048324568383
    GrLivArea: 0.7086244776126511
  OverallQual: 0.7909816005838047

Conclusion

The most strongly correlated features, collected in golden_features_list, are as follows.

golden_features_list = [key for key, value in all_correlations if abs(value) >= 0.5]
print("Following are the top {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list), golden_features_list))
Following are the top 11 strongly correlated values with SalePrice:
['YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GarageCars', '2ndFlrSF', 'GrLivArea', 'OverallQual']

Feature to Feature Relationship

corr = df_num.drop('SalePrice', axis=1).corr()
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

[Figure: feature-to-feature correlation heatmap]


Q -> Q (Quantitative to Quantitative relationship)

quantitative_features_list = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF',
                              '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
                              'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
                              'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']

df_quantitative_values = df[quantitative_features_list]
df_quantitative_values.head()

LotFrontage LotArea MasVnrArea BsmtFinSF1 BsmtFinSF2 TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea ... GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal SalePrice
0 65.0 8450 196.0 706 0 856 856 854 0 1710 ... 2 548 0 61 0 0 0 0 0 208500
1 80.0 9600 0.0 978 0 1262 1262 0 0 1262 ... 2 460 298 0 0 0 0 0 0 181500
2 68.0 11250 162.0 486 0 920 920 866 0 1786 ... 2 608 0 42 0 0 0 0 0 223500
3 60.0 9550 0.0 216 0 756 961 756 0 1717 ... 3 642 0 35 272 0 0 0 0 140000
4 84.0 14260 350.0 655 0 1145 1145 1053 0 2198 ... 3 836 192 84 0 0 0 0 0 250000

5 rows × 28 columns

features_to_analyse = [x for x in quantitative_features_list if x in golden_features_list]
features_to_analyse.append('SalePrice')
features_to_analyse
['TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea',
 'SalePrice']

The Distribution

fig, ax = plt.subplots(round(len(features_to_analyse) / 3), 3, figsize=(18, 12))

for i, ax in enumerate(fig.axes):
    if i < len(features_to_analyse) - 1:
        sns.regplot(x=features_to_analyse[i], y='SalePrice', data=df[features_to_analyse], ax=ax)

[Figure: regression plots of the golden features against SalePrice]

We can see that features such as TotalBsmtSF, 1stFlrSF, and GrLivArea have a big spread, but I cannot yet tell what insight this spread gives us.


C -> Q (Categorical to Quantitative Relationship)

# quantitative_features_list[:-1] as the last column is SalePrice and we want to keep it
categorical_features = [a for a in quantitative_features_list[:-1] + df.columns.tolist() if (a not in quantitative_features_list[:-1]) or (a not in df.columns.tolist())]
df_categ = df[categorical_features]
df_categ.head()

MSSubClass MSZoning Street LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 ... GarageYrBlt GarageFinish GarageQual GarageCond PavedDrive MoSold YrSold SaleType SaleCondition SalePrice
0 60 RL Pave Reg Lvl AllPub Inside Gtl CollgCr Norm ... 2003.0 RFn TA TA Y 2 2008 WD Normal 208500
1 20 RL Pave Reg Lvl AllPub FR2 Gtl Veenker Feedr ... 1976.0 RFn TA TA Y 5 2007 WD Normal 181500
2 60 RL Pave IR1 Lvl AllPub Inside Gtl CollgCr Norm ... 2001.0 RFn TA TA Y 9 2008 WD Normal 223500
3 70 RL Pave IR1 Lvl AllPub Corner Gtl Crawfor Norm ... 1998.0 Unf TA TA Y 2 2006 WD Abnorml 140000
4 60 RL Pave IR1 Lvl AllPub FR2 Gtl NoRidge Norm ... 2000.0 RFn TA TA Y 12 2008 WD Normal 250000

5 rows × 49 columns


Non-numerical features

df_not_num = df_categ.select_dtypes(include=['O'])  # Object ('O') dtype
print('There are {} non numerical features including:\n{}'.format(len(df_not_num.columns), df_not_num.columns.tolist()))
There are 39 non numerical features including:
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

“Looking at these features we can see that a lot of them are of the type Object(O). In our data transformation notebook we could use Pandas categorical functions (equivalent to R’s factor) to shape our data in a way that would be interpretable for our machine learning algorithm. ExterQual for instance could be transformed to an ordered categorical object.”

I definitely need to study this more, especially the data transformation step; a small sketch of the idea follows.
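
As one hedged illustration (mine, not the kernel's), ExterQual can be turned into an ordered pandas categorical; the quality order Po < Fa < TA < Gd < Ex follows the dataset's documented codes:

import pandas as pd

# ordered quality scale: Poor < Fair < Typical/Average < Good < Excellent
qual_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ext_qual = pd.Categorical(df['ExterQual'], categories=qual_order, ordered=True)

# the integer codes respect the ordering and can feed a model directly
df['ExterQual_code'] = ext_qual.codes
print(df[['ExterQual', 'ExterQual_code']].head())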

plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='BsmtExposure', y='SalePrice', data=df_categ)
plt.setp(ax.artists, alpha=.5, linewidth=2, edgecolor="k")
plt.xticks(rotation=45)
(array([0, 1, 2, 3]), <a list of 4 Text xticklabel objects>)

[Figure: SalePrice by BsmtExposure]

plt.figure(figsize=(12, 6))
ax = sns.boxplot(x='SaleCondition', y='SalePrice', data=df_categ)
plt.setp(ax.artists, alpha=.5, linewidth=2, edgecolor="k")
plt.xticks(rotation=45)
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)

[Figure: SalePrice by SaleCondition]

fig, axes = plt.subplots(round(len(df_not_num.columns) / 3), 3, figsize=(12, 30))

for i, ax in enumerate(fig.axes):
    if i < len(df_not_num.columns):
        ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=45)
        sns.countplot(x=df_not_num.columns[i], alpha=0.7, data=df_not_num, ax=ax)

fig.tight_layout()

[Figure: count plots of the non-numerical features]


Deduction

Some categories are predominant for some features, e.g. Utilities and Heating.
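
That predominance is easy to confirm numerically; a quick check, assuming the df_not_num frame from above:

# Utilities and Heating are each dominated by a single category
print(df_not_num['Utilities'].value_counts())
print(df_not_num['Heating'].value_counts())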

Welcome to Hexo! This is your very first post. Check documentation for more info. If you get any problems when using Hexo, you can find the answer in troubleshooting or you can ask me on GitHub.

Quick Start

Create a new post

$ hexo new "My New Post"

More info: Writing

Run server

$ hexo server

More info: Server

Generate static files

$ hexo generate

More info: Generating

Deploy to remote sites

$ hexo deploy

More info: Deployment