# PART1 基礎分析をしよう
# おまじない
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
# データの読み込み
# train.csvとtest.csvとsubmit_sample.csvの3つのファイルを読み込みましょう
# それぞれを読み込んだものは変数はtrain, test, sampleに代入しましょう
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("submit_sample.csv", header=None)
# trainの先頭行を見てみましょう
# 各カラムの説明はDeepAnalyticsのデータダウンロードページに記載されています
train.head()
id age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 1 39 blue-collar married secondary no 1756 yes no cellular 3 apr 939 1 -1 0 unknown 1
1 2 51 entrepreneur married primary no 1443 no no cellular 18 feb 172 10 -1 0 unknown 1
2 3 36 management single tertiary no 436 no no cellular 13 apr 567 1 595 2 failure 1
3 4 63 retired married secondary no 474 no no cellular 25 jan 423 1 -1 0 unknown 1
4 5 31 management single tertiary no 354 no no cellular 30 apr 502 1 9 2 success 1
# trainの行数と列数を見てみましょう
train.shape
(27128, 18)
# trainの基礎統計量を見てみましょう
# train.describe()
id age balance day duration campaign pdays previous y
count 27128.000000 27128.000000 27128.000000 27128.000000 27128.000000 27128.000000 27128.000000 27128.000000 27128.000000
mean 13564.500000 40.951010 1355.800870 15.806215 260.711295 2.751769 40.528052 0.579733 0.117001
std 7831.323388 10.608542 3003.305272 8.337904 260.091727 3.126594 100.382462 2.503653 0.321427
min 1.000000 18.000000 -6847.000000 1.000000 0.000000 1.000000 -1.000000 0.000000 0.000000
25% 6782.750000 33.000000 72.000000 8.000000 104.000000 1.000000 -1.000000 0.000000 0.000000
50% 13564.500000 39.000000 449.000000 16.000000 182.000000 2.000000 -1.000000 0.000000 0.000000
75% 20346.250000 48.000000 1428.000000 21.000000 323.000000 3.000000 -1.000000 0.000000 0.000000
max 27128.000000 95.000000 102127.000000 31.000000 4918.000000 63.000000 871.000000 275.000000 1.000000
# testの基礎統計量も見てみましょう
test.describe()
id age balance day duration campaign pdays previous
count 18083.000000 18083.000000 18083.000000 18083.000000 18083.000000 18083.000000 18083.000000 18083.000000
mean 9042.000000 40.914008 1371.980092 15.806725 254.340264 2.781950 39.702428 0.581209
std 5220.256794 10.634331 3105.985293 8.299509 253.591981 3.054651 99.747634 1.965265
min 1.000000 18.000000 -8019.000000 1.000000 0.000000 1.000000 -1.000000 0.000000
25% 4521.500000 33.000000 72.000000 8.000000 102.000000 1.000000 -1.000000 0.000000
50% 9042.000000 39.000000 447.000000 16.000000 178.000000 2.000000 -1.000000 0.000000
75% 13562.500000 48.000000 1427.000000 21.000000 314.000000 3.000000 -1.000000 0.000000
max 18083.000000 95.000000 98417.000000 31.000000 3881.000000 50.000000 854.000000 58.000000
# trainとtestに欠損がないか見てみましょう
train.isnull().sum()
id 0
age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
duration 0
campaign 0
pdays 0
previous 0
poutcome 0
y 0
dtype: int64
test.isnull().sum()
id 0
age 0
job 0
marital 0
education 0
default 0
balance 0
housing 0
loan 0
contact 0
day 0
month 0
duration 0
campaign 0
pdays 0
previous 0
poutcome 0
dtype: int64
# trainでyが1の人数はどれくらいか見てみましょう
train["y"].value_counts()
0 23954
1 3174
Name: y, dtype: int64
# maritalとyのクロス集計をしてみましょう
pd.crosstab関数を使います
pd.crosstab(X["A"], X["B"])と書いた場合、Aが縦列、Bが横列となります
# 更にオプションとしてmargins=Trueを書くと、総計値のカラムまで作成されるので便利です
pd.crosstab(train["marital"],train["y"],margins=True)
pd.crosstab(train["marital"],train["y"],margins=True)
y 0 1 All
marital
divorced 2691 364 3055
married 14744 1667 16411
single 6519 1143 7662
All 23954 3174 27128
pd.crosstab(train["y"],train["marital"],margins=True)
marital divorced married single All
y
0 2691 14744 6519 23954
1 364 1667 1143 3174
All 3055 16411 7662 27128
# ageをビニングしてみましょう
# ageは数値データなので、クロス集計をする為にはビニングが必要です
# ビニングとは数値データを例えば、グループ①(0より大きい、10以下)、グループ②(10より大きい、20以下)…のように集約することを言います
# ビニングはpd.cut関数を使います
# オプションには、①ビニングしたいデータ、②どう区切るのか?(例えば[0,10,20])を書きます
# まずtrain["age"]の基本統計量を確認しましょう
# その後、ビニングした結果を変数age_biningに代入しましょう
train["age"].describe()
count 27128.000000
mean 40.951010
std 10.608542
min 18.000000
25% 33.000000
50% 39.000000
75% 48.000000
max 95.000000
Name: age, dtype: float64
# age_bining = pd.cut(train["age"], [0, 20, 30, 40, 50, 60, 100])
# age_biningとyを使ってクロス集計をしてみましょう
age_bining
0 (30, 40]
1 (50, 60]
2 (30, 40]
3 (60, 100]
4 (30, 40]
...
27123 (40, 50]
27124 (30, 40]
27125 (30, 40]
27126 (30, 40]
27127 (20, 30]
Name: age, Length: 27128, dtype: category
Categories (6, interval[int64]): [(0, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 100]]
pd.crosstab(age_bining,train["y"],margins=True)
y 0 1 All
age
(0, 20] 39 21 60
(20, 30] 3486 655 4141
(30, 40] 9488 1111 10599
(40, 50] 6117 602 6719
(50, 60] 4416 488 4904
(60, 100] 408 297 705
All 23954 3174 27128
# 実習
# 1.testの行数とカラム数を確かめてみよう
.shape
test.shape
(18083, 17)
# 2.trainのデータには数値データが何カラムあり、質的データが何カラムあるか調べてみよう
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27128 entries, 0 to 27127
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 27128 non-null int64
1 age 27128 non-null int64
2 job 27128 non-null object
3 marital 27128 non-null object
4 education 27128 non-null object
5 default 27128 non-null object
6 balance 27128 non-null int64
7 housing 27128 non-null object
8 loan 27128 non-null object
9 contact 27128 non-null object
10 day 27128 non-null int64
11 month 27128 non-null object
12 duration 27128 non-null int64
13 campaign 27128 non-null int64
14 pdays 27128 non-null int64
15 previous 27128 non-null int64
16 poutcome 27128 non-null object
17 y 27128 non-null int64
dtypes: int64(9), object(9)
memory usage: 3.7+ MB
# 3.poutcomeとyのクロス集計をし、変数poutに代入しよう
# margins=Trueはつけましょう
pout = pd.crosstab(train["poutcome"],train["y"],margins=True)
out
pout
y 0 1 All
poutcome
failure 2578 391 2969
other 950 173 1123
success 312 574 886
unknown 20114 2036 22150
All 23954 3174 27128
# 4.trainのpoutcomeの各値毎に1となる割合を計算し、poutに新たなカラムrateを追加しよう
# poutcomeの各値の合計値に対して1となる割を求めることになるので、pout[1] / pout["All"] で求めることができます
pout["rate"] = pout[1] / pout["All"]
pout
pout
y 0 1 All rate
poutcome
failure 2578 391 2969 0.131694
other 950 173 1123 0.154052
success 312 574 886 0.647856
unknown 20114 2036 22150 0.091919
All 23954 3174 27128 0.117001
# 5.trainのdurationをビニングして、変数duration_biningを作りましょう
# train["duration"]の基礎統計量を確認しておきましょう
# 区切り方は[-1,100,200,300,400,500,600,700,800,900,1000,5000]としましょう
train["duration"].describe()
count 27128.000000
mean 260.711295
std 260.091727
min 0.000000
25% 104.000000
50% 182.000000
75% 323.000000
max 4918.000000
Name: duration, dtype: float64
duration_bining = pd.cut(train["duration"], [-1,100,200,300,400,500,600,700,800,900,1000,5000])
duration_bining
0 (900, 1000]
1 (100, 200]
2 (500, 600]
3 (400, 500]
4 (500, 600]
...
27123 (200, 300]
27124 (300, 400]
27125 (400, 500]
27126 (-1, 100]
27127 (100, 200]
Name: duration, Length: 27128, dtype: category
Categories (11, interval[int64]): [(-1, 100] < (100, 200] < (200, 300] < (300, 400] ... (700, 800] < (800, 900] < (900, 1000] < (1000, 5000]]
# 6.duration_biningとyを使って、クロス集計をし、変数duraに代入しよう
# margins=Trueはつけましょう
dura = pd.crosstab(duration_bining,train["y"],margins=True)
dura
dura
y 0 1 All
duration
(-1, 100] 6379 60 6439
(100, 200] 7909 450 8359
(200, 300] 4286 557 4843
(300, 400] 2258 408 2666
(400, 500] 1231 308 1539
(500, 600] 699 270 969
(600, 700] 441 259 700
(700, 800] 245 189 434
(800, 900] 149 153 302
(900, 1000] 95 123 218
(1000, 5000] 262 397 659
All 23954 3174 27128
# 7.duraの各値毎に1となる割合を計算し、duraに新たなカラムrateを追加しよう
All
dura["rate"] = dura[1] / dura["All"]
dura
dura
y 0 1 All rate
duration
(-1, 100] 6379 60 6439 0.009318
(100, 200] 7909 450 8359 0.053834
(200, 300] 4286 557 4843 0.115011
(300, 400] 2258 408 2666 0.153038
(400, 500] 1231 308 1539 0.200130
(500, 600] 699 270 969 0.278638
(600, 700] 441 259 700 0.370000
(700, 800] 245 189 434 0.435484
(800, 900] 149 153 302 0.506623
(900, 1000] 95 123 218 0.564220
(1000, 5000] 262 397 659 0.602428
All 23954 3174 27128 0.117001