第12章:戦略をブースティングする 第4節: モデルデータの準備
ここの内容は以前にも紹介した記憶がありますが、一応、紹介しておきます。
インポートと設定
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import talib
from talib import RSI, BBANDS, MACD, ATR
# Calendar constants: 21 trading days per month, 252 per year.
MONTH = 21
YEAR = 12 * MONTH

# Sample window for the Quandl Wiki price data.
START = '2010-01-01'
END = '2017-12-31'

sns.set_style('darkgrid')
idx = pd.IndexSlice

# Symmetric tail percentiles for summary stats: 0.1%-5% plus their 95%-99.9% mirror.
percentiles = [.001, .01, .02, .03, .04, .05]
percentiles = percentiles + [1 - p for p in reversed(percentiles)]

# Return horizons in trading days (1d, 1w, 2w, 1m, 2m, 3m).
T = [1, 5, 10, 21, 42, 63]
Quandl Wiki Stock Pricesとメタデータの読み込み
# Location of the HDF5 asset store and the adjusted OHLCV columns to pull.
DATA_STORE = '../data/assets.h5'
ohlcv = ['adj_open', 'adj_close', 'adj_low', 'adj_high', 'adj_volume']

with pd.HDFStore(DATA_STORE) as store:
    # Slice the sample window, strip the 'adj_' prefix, and index by (symbol, date).
    prices = store['quandl/wiki/prices'].loc[idx[START:END, :], ohlcv]
    prices = prices.rename(columns=lambda col: col.replace('adj_', ''))
    prices = prices.swaplevel().sort_index()
    metadata = store['us_equities/stocks'].loc[:, ['marketcap', 'sector']]

prices.volume /= 1e3  # volume in thousands of shares
prices.index.names = ['symbol', 'date']
metadata.index.name = 'symbol'
観測数が少ない銘柄を除去
# Keep only symbols with more than seven years of daily observations.
min_obs = 7 * YEAR
obs_per_symbol = prices.groupby(level='symbol').size()
keep = obs_per_symbol[obs_per_symbol > min_obs].index
prices = prices.loc[idx[keep, :], :]
価格とメタデータを並び替える
# Align prices and metadata: drop duplicate symbols and missing sectors,
# normalize sector labels, then restrict both frames to the shared symbols.
has_sector = metadata.sector.notnull()
metadata = metadata[~metadata.index.duplicated() & has_sector]
metadata.sector = metadata.sector.str.lower().str.replace(' ', '_')
shared = (prices.index.get_level_values('symbol').unique()
          .intersection(metadata.index))
metadata = metadata.loc[shared, :]
prices = prices.loc[idx[shared, :], :]
時価総額上位1000銘柄に絞る
# Restrict the universe to the 1,000 largest stocks by market capitalization.
universe = metadata.marketcap.nlargest(1000).index
metadata = metadata.loc[universe]
prices = prices.loc[idx[universe, :], :]
セクター数
# Sector distribution of the final 1,000-stock universe.
# The triple-quoted string below is the captured notebook output, kept for reference.
metadata.sector.value_counts()
'''
consumer_services 187
finance 168
technology 116
health_care 103
capital_goods 94
basic_industries 67
public_utilities 66
consumer_non-durables 61
energy 51
consumer_durables 36
miscellaneous 28
transportation 23
Name: sector, dtype: int64
'''
価格
# Overview of the filtered price panel (rows, dtypes, non-null counts).
# FIX: `null_counts` was deprecated in pandas 1.2 and removed in pandas 2.0;
# `show_counts=True` is the current equivalent (forces counts even on large frames).
prices.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2004775 entries, ('A', Timestamp('2010-01-04 00:00:00')) to ('ZION', Timestamp('2017-12-29 00:00:00'))
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 open 2004775 non-null float64
1 close 2004775 non-null float64
2 low 2004775 non-null float64
3 high 2004775 non-null float64
4 volume 2004775 non-null float64
dtypes: float64(5)
memory usage: 84.3+ MB
'''
メタデータ
# Overview of the universe metadata (1,000 symbols: marketcap + encoded sector).
# The triple-quoted string below is the captured notebook output, kept for reference.
metadata.info()
'''
<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, AAPL to NTCT
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 marketcap 1000 non-null float64
1 sector 1000 non-null object
dtypes: float64(1), object(1)
memory usage: 23.4+ KB
'''
ドル出来高
# 21-day rolling average dollar volume and its daily cross-sectional rank.
# NOTE(review): `volume` was already scaled to thousands above (prices.volume /= 1e3)
# and div(1e3) scales again, so dollar_vol here is in millions of dollars — the rank
# is unaffected, but confirm the units if the level is ever used directly.
prices['dollar_vol'] = prices.loc[:, 'close'].mul(prices.loc[:, 'volume'], axis=0).div(1e3)
# Per-symbol 21-day rolling mean; reset_index(level=0, drop=True) removes the
# extra group level that groupby(...).rolling() prepends, restoring (symbol, date).
prices['dollar_vol'] = (prices
                        .groupby('symbol',
                                 group_keys=False,
                                 as_index=False)
                        .dollar_vol
                        .rolling(window=21)
                        .mean()
                        .fillna(0)
                        .reset_index(level=0, drop=True))
# Rank symbols by dollar volume within each day (1 = most traded).
prices['dollar_vol_rank'] = (prices
                             .groupby('date')
                             .dollar_vol
                             .rank(ascending=False))
# Only the rank is kept as a feature; the raw rolling level is dropped.
prices = prices.drop('dollar_vol', axis=1)
RSI
# Relative Strength Index (talib default lookback), computed independently per symbol.
prices['rsi'] = prices.groupby(level='symbol')['close'].apply(RSI)
ボリンジャーバンド
def compute_bb(close):
    """Return upper/lower 20-day Bollinger Bands for one symbol's close series."""
    upper, _, lower = BBANDS(close, timeperiod=20)
    return pd.DataFrame({'bb_high': upper, 'bb_low': lower}, index=close.index)

# Compute the bands per symbol and attach them as new columns.
bands = prices.groupby(level='symbol').close.apply(compute_bb)
prices = prices.join(bands)

# Express each band as a log1p-damped relative distance from the close.
prices['bb_high'] = np.log1p((prices.bb_high - prices.close) / prices.bb_high)
prices['bb_low'] = np.log1p((prices.close - prices.bb_low) / prices.close)
ATR
# Normalized ATR (ATR as a percentage of close), computed per symbol.
prices['NATR'] = (prices
                  .groupby(level='symbol', group_keys=False)
                  .apply(lambda df: talib.NATR(df.high, df.low, df.close)))

def compute_atr(stock_data):
    """Return the z-score of the 14-day Average True Range for one symbol."""
    atr = ATR(stock_data.high, stock_data.low,
              stock_data.close, timeperiod=14)
    return (atr - atr.mean()) / atr.std()

prices['ATR'] = prices.groupby('symbol', group_keys=False).apply(compute_atr)
MACD
# Percentage Price Oscillator: a MACD variant normalized by the slower moving
# average, making it comparable across price levels.
prices['PPO'] = prices.groupby(level='symbol').close.apply(talib.PPO)

def compute_macd(close):
    """Return the z-score-standardized MACD line for one symbol's close series.

    talib.MACD returns (macd, signal, hist); only the MACD line ([0]) is used.
    np.mean/np.std on the pandas Series skip the leading NaNs of the warm-up window.
    """
    macd = MACD(close)[0]
    return (macd - np.mean(macd)) / np.std(macd)

# FIX: compute_macd was defined but never applied, so no MACD feature ever
# reached the saved model data. Apply it per symbol, as done for RSI and PPO.
prices['MACD'] = prices.groupby(level='symbol').close.apply(compute_macd)
価格とメタデータを結合
# Encode sector labels as integer codes so the model can consume them,
# then attach the encoded sector to the price panel.
sector_codes, _ = pd.factorize(metadata.sector)
metadata.sector = sector_codes.astype(int)
prices = prices.join(metadata[['sector']])
ヒストリカルリターン
# Historical returns over each horizon in T, computed per symbol.
close_by_symbol = prices.groupby(level='symbol').close
for horizon in T:
    prices[f'r{horizon:02}'] = close_by_symbol.pct_change(horizon)
日次ヒストリカルリターンの十分位数(Decile)
# Cross-sectional decile (0-9) of each horizon's return within every trading day.
# duplicates='drop' guards against non-unique bin edges on low-dispersion days.
for t in T:
    prices[f'r{t:02}dec'] = (prices[f'r{t:02}'].groupby(level='date')
                             .apply(lambda x: pd.qcut(x, q=10, labels=False, duplicates='drop')))
日次セクター内リターンの五分位数(Quintile)
# Quintile (0-4) of each horizon's return within every (day, sector) group.
for horizon in T:
    ret_col = f'r{horizon:02}'
    prices[f'{ret_col}q_sector'] = (prices
                                    .groupby(['date', 'sector'])[ret_col]
                                    .transform(lambda s: pd.qcut(s, q=5, labels=False,
                                                                 duplicates='drop')))
フォワードリターン
# Forward returns (prediction targets): shift each symbol's historical return
# back by its own horizon.
for horizon in [1, 5, 21]:
    ret_col = f'r{horizon:02}'
    prices[f'{ret_col}_fwd'] = prices.groupby(level='symbol')[ret_col].shift(-horizon)
異常値を取り除く
# Drop every symbol that ever shows a one-day return above 100%
# (treated as a data error rather than a genuine move).
suspect_symbols = (prices.loc[prices.r01 > 1]
                   .index.get_level_values('symbol').unique())
prices = prices.drop(suspect_symbols, level='symbol')
時間のダミー変数(年・月・曜日)の作成(セクターは前の節ですでに整数コード化済み)
# Calendar features derived from the date level of the MultiIndex.
trading_dates = prices.index.get_level_values('date')
prices['year'] = trading_dates.year
prices['month'] = trading_dates.month
prices['weekday'] = trading_dates.weekday
モデルデータの保存
# Persist features only: the raw OHLCV columns are dropped before saving to HDF5.
prices.drop(['open', 'close', 'low', 'high', 'volume'], axis=1).to_hdf('data.h5', 'model_data')
この記事が気に入ったらサポートをしてみませんか?