第11章:ランダムフォレスト - ロングショート戦略 第4節: 日本株特徴量
ここでは、予測モデルを作成する前段階として、特徴量を作成する処理を行います。
インポートと設定
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
import talib
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's plain 'white' theme for the figures drawn later on.
sns.set_style('white')
# Shorthand for building MultiIndex slicers passed to .loc below.
idx = pd.IndexSlice
データ取得
# Directory holding the HDF5 asset store, relative to the working directory.
DATA_DIR = Path('..', 'data')
DATA_DIR / 'assets.h5'
# Read the stored price table and restrict the second index level to the
# 2010-2019 range for every entry of the first level.
prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'stooq/jp/tse/stocks/prices')
          .loc[idx[:, '2010': '2019'], :])
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is the supported keyword for the same output.
prices.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10286682 entries, ('1301.JP', Timestamp('2005-03-22 00:00:00')) to ('9997.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 open 10286682 non-null float64
1 high 10286682 non-null float64
2 low 10286682 non-null float64
3 close 10286682 non-null float64
4 volume 10286682 non-null int64
dtypes: float64(4), int64(1)
memory usage: 432.0+ MB
'''
# Number of distinct tickers before any filtering is applied.
before = len(prices.index.unique(level='ticker'))
欠損値を取り除く
# Pivot tickers into columns, forward-fill gaps of at most 5 consecutive rows,
# drop every (field, ticker) column that still contains a missing value, and
# restore the original (ticker, date) long layout.
prices = (prices.unstack('ticker')
          .sort_index()
          .ffill(limit=5)
          .dropna(axis=1)
          .stack('ticker')
          .swaplevel())
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is the supported keyword for the same output.
prices.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873446 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 open 873446 non-null float64
1 high 873446 non-null float64
2 low 873446 non-null float64
3 close 873446 non-null float64
4 volume 873446 non-null float64
dtypes: float64(5)
memory usage: 36.7+ MB
'''
# Number of distinct tickers that survived the missing-value filter.
after = len(prices.index.unique(level='ticker'))
print(f'Before: {before:,.0f} after: {after:,.0f}')
'''
Before: 3,667 after: 178
'''
最も取引されている銘柄を残す(売買代金の中央値が大きい上位1,000銘柄)
# Traded value per row: closing price times volume.
dv = prices['close'] * prices['volume']
# Rank tickers by their median traded value and keep the 1000 largest.
median_dv = dv.groupby('ticker').median()
keep = median_dv.nlargest(1000).index.tolist()
prices = prices.loc[idx[keep, :], :]
prices.info()
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873446 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 open 873446 non-null float64
1 high 873446 non-null float64
2 low 873446 non-null float64
3 close 873446 non-null float64
4 volume 873446 non-null float64
dtypes: float64(5)
memory usage: 36.7+ MB
'''
特徴量エンジニアリング
# Look-back horizons (in rows per ticker) for the lagged returns.
intervals = [1, 5, 10, 21, 63]
by_ticker = prices.groupby(level='ticker').close
# One 'ret_<t>' column per horizon, computed within each ticker.
returns = pd.concat(
    [by_ticker.pct_change(t).to_frame(f'ret_{t}') for t in intervals],
    axis=1,
)
returns.info()
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873446 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ret_1 873268 non-null float64
1 ret_5 872556 non-null float64
2 ret_10 871666 non-null float64
3 ret_21 869708 non-null float64
4 ret_63 862232 non-null float64
dtypes: float64(5)
memory usage: 36.7+ MB
'''
異常値を取り除く
# Largest return each ticker ever posted, per horizon.
max_ret_by_sym = returns.groupby(level='ticker').max()
percentiles = [0.001, .005, .01, .025, .05, .1]
percentiles = percentiles + [1 - p for p in percentiles]
# Summarise using only the upper six percentiles.
max_ret_by_sym.describe(percentiles=sorted(percentiles)[6:])
# Per-horizon 95th percentile of those maxima.
quantiles = max_ret_by_sym.quantile(.95)
# Collect, horizon by horizon, the tickers whose maximum exceeds the cut-off.
flagged = [
    ticker
    for ret, q in quantiles.items()
    for ticker in max_ret_by_sym.loc[max_ret_by_sym[ret] > q].index.tolist()
]
# A ticker is dropped only when it is flagged for more than one horizon.
flag_counts = pd.Series(flagged).value_counts()
to_drop = flag_counts[flag_counts > 1].index.tolist()
len(to_drop)
'''
11
'''
# Remove every row belonging to a ticker flagged as an outlier.
prices = prices.drop(index=to_drop, level='ticker')
prices.info()
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 819469 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 open 819469 non-null float64
1 high 819469 non-null float64
2 low 819469 non-null float64
3 close 819469 non-null float64
4 volume 819469 non-null float64
dtypes: float64(5)
memory usage: 34.5+ MB
'''
相対リターンのパーセンタイルを計算
# Rebuild the lagged returns on the filtered prices and add, for each horizon,
# the cross-sectional rank bucket of every return within its date.
returns = []
by_sym = prices.groupby(level='ticker').close
for t in intervals:
    ret = by_sym.pct_change(t)
    # group_keys=False keeps the original (ticker, date) index on the result;
    # without it pandas >= 2.0 prepends the grouping date as an extra level and
    # the concat below no longer aligns with `ret`.
    rel_perc = (ret.groupby(level='date', group_keys=False)
                .apply(lambda x: pd.qcut(x, q=20, labels=False, duplicates='drop')))
    returns.extend([ret.to_frame(f'ret_{t}'), rel_perc.to_frame(f'ret_rel_perc_{t}')])
returns = pd.concat(returns, axis=1)
テクニカルインジケーター
Percentage Price Oscillator
# group_keys=False (as already used for NATR) keeps the (ticker, date) index;
# pandas >= 2.0 would otherwise prepend a duplicate ticker level.
# NOTE(review): relies on talib.PPO returning a Series indexed like its input.
ppo = (prices.groupby(level='ticker', group_keys=False)
       .close
       .apply(talib.PPO)
       .to_frame('PPO'))
Normalized Average True Range
# NATR needs high, low and close, so apply it to each ticker's full frame.
natr = (prices
        .groupby(level='ticker', group_keys=False)
        .apply(lambda x: talib.NATR(x.high, x.low, x.close))
        .to_frame('NATR'))
RSI
# group_keys=False (as already used for NATR) keeps the (ticker, date) index;
# pandas >= 2.0 would otherwise prepend a duplicate ticker level.
# NOTE(review): relies on talib.RSI returning a Series indexed like its input.
rsi = (prices.groupby(level='ticker', group_keys=False)
       .close
       .apply(talib.RSI)
       .to_frame('RSI'))
Bollinger Bands
def get_bollinger(x):
    """Return the three talib.BBANDS outputs for `x` as columns 'u', 'm', 'l'.

    `x` is the per-ticker close series handed over by groupby.apply. The
    frame is explicitly indexed like `x` so the rows stay aligned with the
    input even if talib returns plain arrays instead of Series.
    """
    upper, middle, lower = talib.BBANDS(x)
    return pd.DataFrame({'u': upper, 'm': middle, 'l': lower}, index=x.index)
# group_keys=False (as already used for NATR) keeps the index returned by
# get_bollinger; pandas >= 2.0 would otherwise prepend an extra ticker level
# and the later concat with `prices` would not align.
bbands = (prices.groupby(level='ticker', group_keys=False)
          .close
          .apply(get_bollinger))
特徴量を結合する
# Join prices and all derived features column-wise on the shared index, then
# replace the raw band levels with ratios relative to the close.
data = (
    pd.concat([prices, returns, ppo, natr, rsi, bbands], axis=1)
    .assign(
        bbl=lambda df: df['close'].div(df['l']),
        bbu=lambda df: df['u'].div(df['close']),
    )
    .drop(columns=['u', 'm', 'l'])
)
# Rank correlation between the two band ratios.
data.bbu.corr(data.bbl, method='spearman')
'''
-0.17464878509378576
'''
ランダムに選んだティッカーのインジケーターをプロット
indicators = ['close', 'bbl', 'bbu', 'PPO', 'NATR', 'RSI']
# Draw one ticker at random from the row-level ticker labels.
ticker = np.random.choice(data.index.get_level_values('ticker'))
# Keep only that ticker's rows and index them by date alone for plotting.
sample = data.loc[idx[ticker, :], indicators].reset_index('ticker', drop=True)
sample.plot(lw=1, subplots=True, figsize=(16, 10), title=indicators,
            layout=(3, 2), legend=False)
plt.suptitle(ticker, fontsize=14)
sns.despine()
plt.tight_layout()
plt.subplots_adjust(top=.95)
# The raw price columns are no longer needed once the plot is drawn.
data = data.drop(columns=prices.columns)
時間系インジケーター作成
# Calendar features taken from the 'date' index level.
dates = data.index.get_level_values('date')
for part in ('weekday', 'month', 'year'):
    data[part] = getattr(dates, part)
フォワードリターンの計算
# Names of the target columns created below.
outcomes = []
by_ticker = data.groupby('ticker')
for t in intervals:
    k = f'fwd_ret_{t:02}'
    outcomes.append(k)
    # Shifting the t-row trailing return back by t rows within each ticker
    # places, on every row, the return realised over the following t rows.
    data[k] = by_ticker[f'ret_{t}'].shift(-t)
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is the supported keyword for the same output.
data.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 819469 entries, ('1332.JP', Timestamp('2000-01-04 00:00:00')) to ('9735.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 23 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ret_1 819302 non-null float64
1 ret_rel_perc_1 818968 non-null float64
2 ret_5 818634 non-null float64
3 ret_rel_perc_5 818634 non-null float64
4 ret_10 817799 non-null float64
5 ret_rel_perc_10 817799 non-null float64
6 ret_21 815962 non-null float64
7 ret_rel_perc_21 815962 non-null float64
8 ret_63 808948 non-null float64
9 ret_rel_perc_63 808948 non-null float64
10 PPO 815294 non-null float64
11 NATR 817131 non-null float64
12 RSI 817131 non-null float64
13 bbl 818801 non-null float64
14 bbu 818801 non-null float64
15 weekday 819469 non-null int64
16 month 819469 non-null int64
17 year 819469 non-null int64
18 fwd_ret_01 819302 non-null float64
19 fwd_ret_05 818634 non-null float64
20 fwd_ret_10 817799 non-null float64
21 fwd_ret_21 815962 non-null float64
22 fwd_ret_63 808948 non-null float64
dtypes: float64(20), int64(3)
memory usage: 147.0+ MB
'''
# Persist the feature table. `key` is passed by keyword because positional
# arguments after the path are deprecated in pandas 2.1 and keyword-only in 3.0.
data.to_hdf('data.h5', key='stooq/japan/equities')
この記事が気に入ったらサポートをしてみませんか?