import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression as LR
# Read the three CSV inputs from the working directory. The sample file is
# read with header=None, so its columns get the integer labels 0 and 1.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
sample = pd.read_csv("sample.csv", header=None)
# Preview the first rows of the training data.
train.head()
datetime y week soldout name kcal remarks event payday weather precipitation temperature
0 2013-11-18 90 月 0 厚切りイカフライ NaN NaN NaN NaN 快晴 -- 19.8
1 2013-11-19 101 火 1 手作りヒレカツ NaN NaN NaN NaN 快晴 -- 17.0
2 2013-11-20 118 水 0 白身魚唐揚げ野菜あん NaN NaN NaN NaN 快晴 -- 15.5
3 2013-11-21 120 木 1 若鶏ピリ辛焼 NaN NaN NaN NaN 快晴 -- 15.2
4 2013-11-22 130 金 1 ビッグメンチカツ NaN NaN NaN NaN 快晴 -- 16.1
# Model 1: regress the target column `y` on temperature alone.
y = train["y"]
# reshape(-1, 1) turns each 1-D column into a single-column 2-D array, the
# (n_samples, n_features) layout that fit/predict are given below.
trainX = train["temperature"].values.reshape(-1, 1)
testX = test["temperature"].values.reshape(-1, 1)
model1 = LR()
# Kept as the final expression so the notebook displays the fitted estimator.
model1.fit(trainX, y)
LinearRegression()
# Fitted slope(s) of model 1; a single value because the model was trained on
# one feature column (temperature).
model1.coef_
array([-2.5023821])
# Fitted intercept of model 1, i.e. the predicted `y` at temperature 0.
model1.intercept_
134.79948383749922
# Predict `y` for every test row from its temperature (testX was built from
# test["temperature"] above), then display the resulting array.
pred = model1.predict(testX)
pred
array([ 84.25136537, 74.99255159, 62.9811175 , 80.99826864,
79.49683938, 76.49398085, 78.49588653, 69.48731097,
67.73564349, 95.76232304, 88.25517673, 77.24469548,
74.99255159, 84.25136537, 96.01256125, 98.01446693,
88.00493852, 77.99541011, 87.00398568, 85.75279463,
85.75279463, 87.75470031, 87.5044621 , 94.7613702 ,
86.75374747, 84.50160358, 81.74898327, 98.01446693,
98.76518156, 86.00303284, 95.26184662, 99.01541977,
94.51113199, 97.51399051, 108.52447176, 97.51399051,
106.27232787, 111.7775685 , 97.01351409, 96.51303767])
0 1
0 2014-10-1 24
1 2014-10-2 71
2 2014-10-3 25
3 2014-10-6 32
4 2014-10-7 60
0 1
0 2014-10-1 84.251365
1 2014-10-2 74.992552
2 2014-10-3 62.981117
3 2014-10-6 80.998269
4 2014-10-7 79.496839
# Store the temperature-model predictions in column 1 of the submission frame
# (the same pattern the kcal model uses further down) so that the file written
# here actually contains model 1's output rather than the template values.
# Re-assigning is harmless if the column was already filled.
sample[1] = pred
# Write without the index column and without a header row; the documented
# values for these switches are booleans, so pass False rather than None.
sample.to_csv("submit1.csv", index=False, header=False)
# Count the missing values in each column of the training data.
train.isna().sum()
datetime 0
y 0
week 0
soldout 0
name 0
kcal 41
remarks 186
event 193
payday 197
weather 0
precipitation 0
temperature 0
dtype: int64
# Count the missing values in each column of the test data.
test.isna().sum()
datetime 0
week 0
soldout 0
name 0
kcal 4
remarks 33
event 37
payday 38
weather 0
precipitation 0
temperature 0
dtype: int64
# Mean of the kcal column over the training data. Series.mean() skips NaN by
# default, so only rows with a recorded kcal value contribute.
avg = train["kcal"].mean()
avg
404.4096385542169
# Impute missing kcal values in both data sets with the training-set mean, so
# the test rows are filled with the same statistic the model is trained on.
trainX, testX = (frame["kcal"].fillna(avg) for frame in (train, test))
# Display the imputed training feature.
trainX
0 404.409639
1 404.409639
2 404.409639
3 404.409639
4 404.409639
...
202 408.000000
203 394.000000
204 404.409639
205 404.000000
206 398.000000
Name: kcal, Length: 207, dtype: float64
# Model 2: regress the target column `y` on the imputed kcal feature alone.
y = train["y"]
# Convert each 1-D Series into a single-column 2-D array for fit/predict.
trainX, testX = (column.values.reshape(-1, 1) for column in (trainX, testX))
model2 = LR()
# Kept as the final expression so the notebook displays the fitted estimator.
model2.fit(trainX, y)
LinearRegression()
# Fitted slope(s) of model 2; a single value because the model was trained on
# one feature column (kcal).
model2.coef_
array([0.13195178])
# Fitted intercept of model 2, i.e. the predicted `y` at kcal 0.
model2.intercept_
33.26061577029441
# Predict `y` for every test row from its (mean-imputed) kcal value, then
# display the resulting array.
pred2 = model2.predict(testX)
pred2
array([88.68036439, 88.02060548, 86.70108765, 86.04132874, 89.99988221,
88.68036439, 90.65964112, 86.62318841, 87.096943 , 92.63891786,
86.04132874, 88.02060548, 89.47207508, 87.36084656, 88.02060548,
89.73597865, 86.62318841, 86.96499122, 89.3401233 , 89.99988221,
89.47207508, 90.79159291, 91.31940004, 89.3401233 , 87.36084656,
86.62318841, 87.62475013, 89.47207508, 88.41646082, 85.51352161,
89.60402686, 90.39573756, 87.22889478, 89.99988221, 88.81231617,
86.62318841, 93.95843568, 88.02060548, 88.68036439, 88.15255726])
# Overwrite column 1 of the submission frame with the kcal-model predictions
# and preview the result; column 0 is left unchanged.
sample[1] = pred2
sample.head()
0 1
0 2014-10-1 88.680364
1 2014-10-2 88.020605
2 2014-10-3 86.701088
3 2014-10-6 86.041329
4 2014-10-7 89.999882
# Save the kcal-model submission without the index column and without a header
# row. The documented values for these switches are booleans, so pass False
# explicitly instead of relying on None being falsy.
sample.to_csv("submit2.csv", index=False, header=False)