第11章:ランダムフォレスト - ロングショート戦略 第5節: ランダムフォレストで売りと買いシグナルを生成する
ここでやること
こんにちは、前回まではデータの準備で色々とやっておりましたが、今回はモデルの作り方について紹介します。
最初に線形回帰で予測モデルを作成して、その後、LightGBMで予測モデルを作成して、二種類のモデルを情報係数基準で比較します。
最後に、最もよかったモデル達を利用して、アンサンブルモデルを作る方法も紹介します。
インポートと設定
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from time import time
from io import StringIO
import sys, os
from itertools import product
from pathlib import Path
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import MultipleTimeSeriesCV, format_time
# Apply seaborn's whitegrid theme to all plots in this notebook.
sns.set_style('whitegrid')
# Reproducibility and shared constants.
np.random.seed(42)
YEAR = 252  # trading days per year
idx = pd.IndexSlice  # shorthand for MultiIndex slicing
DATA_DIR = Path('..', 'data')
results_path = Path('results', 'return_predictions')
# exist_ok replaces the race-prone check-then-create pattern.
results_path.mkdir(parents=True, exist_ok=True)
データ取得
# Load the Japanese-equity feature/label panel built in the previous sections.
data = pd.read_hdf('data.h5', 'stooq/japan/equities')
# show_counts replaces null_counts, which was deprecated in pandas 1.2
# and removed in pandas 2.0 (the frame is large, so counts are off by default).
data.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2302060 entries, ('1332.JP', Timestamp('2010-01-04 00:00:00')) to ('9990.JP', Timestamp('2019-12-30 00:00:00'))
Data columns (total 23 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ret_1 2301120 non-null float64
1 ret_rel_perc_1 2301120 non-null float64
2 ret_5 2297360 non-null float64
3 ret_rel_perc_5 2297360 non-null float64
4 ret_10 2292660 non-null float64
5 ret_rel_perc_10 2292660 non-null float64
6 ret_21 2282320 non-null float64
7 ret_rel_perc_21 2282320 non-null float64
8 ret_63 2242840 non-null float64
9 ret_rel_perc_63 2242840 non-null float64
10 PPO 2278560 non-null float64
11 NATR 2288900 non-null float64
12 RSI 2288900 non-null float64
13 bbl 2298300 non-null float64
14 bbu 2298300 non-null float64
15 weekday 2302060 non-null int64
16 month 2302060 non-null int64
17 year 2302060 non-null int64
18 fwd_ret_01 2301120 non-null float64
19 fwd_ret_05 2297360 non-null float64
20 fwd_ret_10 2292660 non-null float64
21 fwd_ret_21 2282320 non-null float64
22 fwd_ret_63 2242840 non-null float64
dtypes: float64(20), int64(3)
memory usage: 412.9+ MB
'''
# Number of distinct tickers in the panel.
len(data.index.unique('ticker'))
'''
940
'''
出来高上位50銘柄を選定する。
元(書籍)のnotebookでは250銘柄としているが、計算時間が非常に長くなるため、本記事では50銘柄に絞る。
# Build the trading universe: the 50 most liquid tickers by average daily
# dollar volume (close * volume) over 2010-2017.
prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'stooq/jp/tse/stocks/prices')
          .loc[idx[:, '2010': '2017'], :])
dollar_vol = prices.close.mul(prices.volume).loc[idx[:, :'2017'],:]
# Cross-sectional rank per day: 1 = highest dollar volume that day.
dollar_vol_rank = dollar_vol.groupby(level='date').rank(ascending=False)
# Lowest average rank over time = most consistently liquid names.
universe = dollar_vol_rank.groupby(level='ticker').mean().nsmallest(50).index
多重時系列交差検証
# Demonstrate the purged walk-forward splits: 36 folds of 21 test days each,
# two years of training data, and a 5-day gap (lookahead) to prevent leakage
# from the forward-return labels into the training window.
cv = MultipleTimeSeriesCV(n_splits=36,
                          test_period_length=21,
                          lookahead=5,
                          train_period_length=2 * 252)
for i, (train_idx, test_idx) in enumerate(cv.split(X=data)):
    train = data.iloc[train_idx]
    train_dates = train.index.get_level_values('date')
    test = data.iloc[test_idx]
    test_dates = test.index.get_level_values('date')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df = pd.concat([train.reset_index(), test.reset_index()])
    n = len(df)
    # No row may appear in both the train and the test set.
    assert n == len(df.drop_duplicates())
    msg = f'Training: {train_dates.min().date()}-{train_dates.max().date()} '
    msg += f' ({train.groupby(level="ticker").size().value_counts().index[0]:,.0f} days) | '
    msg += f'Test: {test_dates.min().date()}-{test_dates.max().date()} '
    msg += f'({test.groupby(level="ticker").size().value_counts().index[0]:,.0f} days)'
    print(msg)
    if i == 3:
        break
'''
Training: 2017-10-24-2019-11-25 (508 days) | Test: 2019-12-02-2019-12-30 (21 days)
Training: 2017-09-22-2019-10-24 (508 days) | Test: 2019-10-31-2019-11-29 (21 days)
Training: 2017-08-23-2019-09-20 (508 days) | Test: 2019-09-30-2019-10-30 (21 days)
Training: 2017-07-24-2019-08-21 (508 days) | Test: 2019-08-28-2019-09-27 (21 days)
'''
モデルの選定:期間とホライズン
期間は2017年までとする。
# Restrict cross-validation to the liquid universe and the 2010-2017 period;
# 2018-2019 remain untouched for later out-of-sample testing.
cv_data = data.loc[idx[universe, :'2017'], :]
tickers = cv_data.index.unique('ticker')
データを保存しておく。
# Persist the CV subset alongside the full panel for reuse.
cv_data.to_hdf('data.h5', 'stooq/japan/equities/cv_data')
with pd.HDFStore('data.h5') as store:
    print(store.info())
予測ホライズン
# Forecast horizons (trading days) matching the fwd_ret_* label columns.
lookaheads = [1, 5, 10, 21]
ベースライン: 線形回帰モデル
線形回帰については第7章に詳しく書いてある。
# Baseline learner; chapter 7 covers linear regression in detail.
lr = LinearRegression()
# Forward returns are the labels; every other column is a feature.
labels = sorted(cv_data.filter(like='fwd').columns)
features = cv_data.columns.difference(labels).tolist()
CVのパラメーター
# Candidate training windows and test-window lengths (trading days).
train_lengths = [5 * YEAR, 3 * YEAR, YEAR, 126, 63]
test_lengths = [5, 21]
# Full grid: 4 horizons x 5 train windows x 2 test windows = 40 combinations.
test_params = list(product(lookaheads, train_lengths, test_lengths))
# Cross-validate the linear baseline over every (horizon, train window,
# test window) combination, recording the mean per-fold information
# coefficient and the IC of the pooled predictions.
lr_metrics = []
for lookahead, train_length, test_length in test_params:
    target = f'fwd_ret_{lookahead:02}'
    sample = cv_data.loc[:, features + [target]].dropna()
    X = sample.drop(target, axis=1)
    y = sample[target]
    # Always validate over two years, whatever the test-window length.
    cv = MultipleTimeSeriesCV(n_splits=int(2 * YEAR / test_length),
                              test_period_length=test_length,
                              lookahead=lookahead,
                              train_period_length=train_length)
    fold_ics, fold_preds = [], []
    for train_idx, test_idx in cv.split(X=X):
        lr.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_test = y.iloc[test_idx]
        y_pred = lr.predict(X.iloc[test_idx])
        fold_preds.append(y_test.to_frame('y_true').assign(y_pred=y_pred))
        fold_ics.append(spearmanr(y_test, y_pred)[0])
    pooled = pd.concat(fold_preds)
    lr_metrics.append([
        lookahead, train_length, test_length,
        np.mean(fold_ics),                              # mean per-fold rank IC
        spearmanr(pooled.y_true, pooled.y_pred)[0]      # pooled rank IC
    ])
columns = ['lookahead', 'train_length', 'test_length', 'ic_by_day', 'ic']
lr_metrics = pd.DataFrame(lr_metrics, columns=columns)
lr_metrics.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 lookahead 40 non-null int64
1 train_length 40 non-null int64
2 test_length 40 non-null int64
3 ic_by_day 40 non-null float64
4 ic 40 non-null float64
dtypes: float64(2), int64(3)
memory usage: 1.7 KB
'''
Lookaheadによる情報係数分布
# Stack the per-fold ('By Day') and pooled ('Overall') IC measures into
# long format for faceted plotting.
by_day = (lr_metrics.drop('ic', axis=1)
          .rename(columns={'ic_by_day': 'ic'})
          .assign(Measured='By Day'))
overall = lr_metrics.drop('ic_by_day', axis=1).assign(Measured='Overall')
lr_metrics_long = pd.concat([by_day, overall])
lr_metrics_long.columns = ['Lookahead', 'Train Length', 'Test Length', 'IC', 'Measure']
lr_metrics_long.info()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 39
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Lookahead 80 non-null int64
1 Train Length 80 non-null int64
2 Test Length 80 non-null int64
3 IC 80 non-null float64
4 Measure 80 non-null object
dtypes: float64(1), int64(3), object(1)
memory usage: 3.8+ KB
'''
# IC by training-window length, faceted by horizon (cols) and measure (rows).
sns.catplot(x='Train Length',
            y='IC',
            hue='Test Length',
            col='Lookahead',
            row='Measure',
            data=lr_metrics_long,
            kind='bar')
# IC distributions per horizon: per-fold (left) vs pooled (right).
fig, axes =plt.subplots(ncols=2, figsize=(14,5), sharey=True)
sns.boxplot(x='lookahead', y='ic_by_day',data=lr_metrics, ax=axes[0])
axes[0].set_title('IC by Day')
sns.boxplot(x='lookahead', y='ic',data=lr_metrics, ax=axes[1])
axes[1].set_title('IC Overall')
axes[0].set_ylabel('Information Coefficient')
axes[1].set_ylabel('')
sns.despine()
fig.tight_layout()
最適訓練/テスト期間
# Top-3 configurations per horizon, ranked by pooled IC.
(lr_metrics.groupby('lookahead', group_keys=False)
 .apply(lambda x: x.nlargest(3, 'ic')))
# Persist the baseline results for the model comparison further below.
lr_metrics.to_csv(results_path / 'lin_reg_performance.csv', index=False)
LightGBM ランダムフォレストモデル チューニング
def get_fi(model):
    """Return a trained booster's gain-based feature importances,
    normalized to sum to 1, indexed by feature name."""
    importance = model.feature_importance(importance_type='gain')
    total = importance.sum()
    return pd.Series(importance / total, index=model.feature_name())
# Shared LightGBM settings. Random-forest mode requires bagging on every
# iteration (bagging_freq=1) together with bagging_fraction < 1 below.
base_params = dict(boosting_type='rf',
                   objective='regression',
                   bagging_freq=1,
                   verbose=-1)
ハイパーパラメーターオプション
# Hyperparameter candidates for the random forest.
bagging_fraction_opts = [.5, .75, .95]    # row subsampling per tree
feature_fraction_opts = [.75, .95]        # column subsampling per tree
min_data_in_leaf_opts = [250, 500, 1000]  # regularization via minimum leaf size
組み合わせは3×2×3=18通り
# Full Cartesian grid of the three hyperparameters above.
cv_params = list(product(bagging_fraction_opts,
                         feature_fraction_opts,
                         min_data_in_leaf_opts))
n_cv_params = len(cv_params)
n_cv_params
'''
18
'''
ランダムサンプル
# Randomized search: keep half (9) of the 18 hyperparameter combinations.
sample_proportion = .5
sample_size = int(sample_proportion * n_cv_params)
cv_param_sample = np.random.choice(list(range(n_cv_params)),
                                   size=int(sample_size),
                                   replace=False)
cv_params_ = [cv_params[i] for i in cv_param_sample]
print('# CV parameters:', len(cv_params_))
'''
# CV parameters: 9
'''
# Evaluate each model at checkpoints of 25, 50, ..., 500 trees;
# train once with the maximum (500) and predict at each checkpoint.
num_iterations = [25] + list(range(50, 501, 25))
num_boost_round = num_iterations[-1]
訓練/テスト期間
# Same candidate train/test windows as for the linear baseline.
train_lengths = [5 * YEAR, 3 * YEAR, YEAR, 126, 63]
test_lengths = [5, 21]
test_params = list(product(train_lengths, test_lengths))
n_test_params = len(test_params)
# sample_proportion = 1.0 keeps all 10 window combinations (the sampling
# machinery is retained so the search can be thinned if it runs too long).
sample_proportion = 1.0
sample_size = int(sample_proportion * n_test_params)
test_param_sample = np.random.choice(list(range(n_test_params)),
                                     size=int(sample_size),
                                     replace=False)
test_params_ = [test_params[i] for i in test_param_sample]
print('Train configs:', len(test_params_))
print('CV Iterations:', len(cv_params_) * len(test_params_))
'''
Train configs: 10
CV Iterations: 90
'''
カテゴリカル変数
# Encode calendar features as integer codes for LightGBM's native
# categorical-feature handling.
categoricals = ['year', 'weekday', 'month']
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]
交差検証実行
labels = sorted(cv_data.filter(like='fwd').columns)
features = cv_data.columns.difference(labels).tolist()
# Map each horizon to its forward-return label column (e.g. 1 -> fwd_ret_01).
label_dict = dict(zip(lookaheads, labels))
# HDF store collecting all tuning results.
cv_store = Path(results_path / 'parameter_tuning.h5')
# Result columns: hyperparameters, horizon t, then one IC per tree checkpoint.
ic_cols = ['bagging_fraction',
           'feature_fraction',
           'min_data_in_leaf',
           't'] + [str(n) for n in num_iterations]
以下のステップでモデルを作る。
・予測ホライズンと訓練/テスト期間を一個ずつ試す
・それぞれの対してMultipleTimeSeriesCVを設定する
・LightGBMのデータセットの目的変数をバイナリ化する
・ハイパーパラメーターから最適なモデルを探索する
# Hyperparameter search: for every horizon and every train/test window,
# cross-validate each sampled hyperparameter combination, recording
# predictions, daily and pooled ICs, and feature importances to cv_store.
for lookahead in lookaheads:
    for train_length, test_length in test_params_:
        # Always validate over roughly two years of test data.
        n_splits = int(2 * YEAR / test_length)
        print(f'Lookahead: {lookahead:2.0f} | Train: {train_length:3.0f} | '
              f'Test: {test_length:2.0f} | Params: {len(cv_params_):3.0f}')
        cv = MultipleTimeSeriesCV(n_splits=n_splits,
                                  test_period_length=test_length,
                                  train_period_length=train_length,
                                  lookahead=lookahead)
        label = label_dict[lookahead]
        outcome_data = data.loc[:, features + [label]].dropna()
        # free_raw_data=False keeps the data in memory so .subset() works
        # across folds without re-reading.
        lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
                               label=outcome_data[label],
                               categorical_feature=categoricals,
                               free_raw_data=False)
        predictions, daily_ic, ic, feature_importance = [], [], [], []
        # Store key: horizon/train_length/test_length.
        key = f'{lookahead}/{train_length}/{test_length}'
        T = 0  # cumulative wall-clock time across parameter combinations
        for p, (bagging_fraction, feature_fraction, min_data_in_leaf) in enumerate(cv_params_):
            params = base_params.copy()
            params.update(dict(bagging_fraction=bagging_fraction,
                               feature_fraction=feature_fraction,
                               min_data_in_leaf=min_data_in_leaf))
            start = time()
            cv_preds, nrounds = [], []
            for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
                lgb_train = lgb_data.subset(train_idx.tolist()).construct()
                lgb_test = lgb_data.subset(test_idx.tolist()).construct()
                # NOTE(review): verbose_eval was removed in LightGBM 4.0;
                # on current versions this call needs log callbacks instead.
                model = lgb.train(params=params,
                                  train_set=lgb_train,
                                  num_boost_round=num_boost_round,
                                  verbose_eval=False)
                # Collect per-fold feature importances as columns of `fi`.
                if i == 0:
                    fi = get_fi(model).to_frame()
                else:
                    fi[i] = get_fi(model)
                test_set = outcome_data.iloc[test_idx, :]
                X_test = test_set.loc[:, model.feature_name()]
                y_test = test_set.loc[:, label]
                # One prediction column per tree-count checkpoint.
                y_pred = {str(n): model.predict(X_test, num_iteration=n)
                          for n in num_iterations}
                cv_preds.append(y_test.to_frame(
                    'y_test').assign(**y_pred).assign(i=i))
                nrounds.append(model.best_iteration)
            # Summary stats of importances across folds, tagged with the params.
            feature_importance.append(fi.T.describe().T.assign(bagging_fraction=bagging_fraction,
                                                               feature_fraction=feature_fraction,
                                                               min_data_in_leaf=min_data_in_leaf))
            cv_preds = pd.concat(cv_preds).assign(bagging_fraction=bagging_fraction,
                                                  feature_fraction=feature_fraction,
                                                  min_data_in_leaf=min_data_in_leaf)
            predictions.append(cv_preds)
            # Daily rank IC per checkpoint: one column per tree count.
            by_day = cv_preds.groupby(level='date')
            ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test,
                                                                    x[str(n)])[0]).to_frame(n)
                                   for n in num_iterations], axis=1)
            daily_ic.append(ic_by_day.assign(bagging_fraction=bagging_fraction,
                                             feature_fraction=feature_fraction,
                                             min_data_in_leaf=min_data_in_leaf))
            # Pooled rank IC per checkpoint over all folds.
            cv_ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0]
                     for n in num_iterations]
            T += time() - start
            ic.append([bagging_fraction, feature_fraction,
                       min_data_in_leaf, lookahead] + cv_ic)
            msg = f'{p:3.0f} | {format_time(T)} | '
            msg += f'{bagging_fraction:3.0%} | {feature_fraction:3.0%} | {min_data_in_leaf:5,.0f} | '
            msg += f'{max(cv_ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {ic_by_day.median().max(): 6.2%}'
            print(msg)
        # Persist all results for this (horizon, train, test) combination.
        m = pd.DataFrame(ic, columns=ic_cols)
        m.to_hdf(cv_store, 'ic/' + key)
        pd.concat(daily_ic).to_hdf(cv_store, 'daily_ic/' + key)
        pd.concat(feature_importance).to_hdf(cv_store, 'fi/' + key)
        pd.concat(predictions).to_hdf(cv_store, 'predictions/' + key)
交差検証 結果分析
分析用データを計算
以下の変数を結合する
# Identifier columns used to reshape the tuning results to long format.
id_vars = ['train_length',
           'test_length',
           'bagging_fraction',
           'feature_fraction',
           'min_data_in_leaf',
           't', 'date']
ICの計算。
# Collect pooled and daily ICs from cv_store into two long-format frames,
# one row per (configuration, checkpoint[, date]).
daily_ic, ic = [], []
for t in lookaheads:
    print(t)
    with pd.HDFStore(cv_store) as store:
        # The fi/ keys enumerate every (train, test) combination run for t.
        keys = [k[1:] for k in store.keys() if k.startswith(f'/fi/{t}')]
        for key in keys:
            train_length, test_length = key.split('/')[2:]
            print(train_length, test_length)
            k = f'{t}/{train_length}/{test_length}'
            cols = {'t': t,
                    'train_length': int(train_length),
                    'test_length': int(test_length)}
            # Melt the per-checkpoint IC columns into a 'rounds' variable.
            ic.append(pd.melt(store['ic/' + k]
                              .assign(**cols),
                              id_vars=id_vars[:-1],
                              value_name='ic',
                              var_name='rounds')
                      .apply(pd.to_numeric))
            df = store['daily_ic/' + k].assign(**cols).reset_index()
            daily_ic.append(pd.melt(df,
                                    id_vars=id_vars,
                                    value_name='daily_ic',
                                    var_name='rounds')
                            # park 'date' in the index so to_numeric only
                            # touches the value columns
                            .set_index('date')
                            .apply(pd.to_numeric)
                            .reset_index())
ic = pd.concat(ic, ignore_index=True)
daily_ic = pd.concat(daily_ic, ignore_index=True)
予測パフォーマンス: 日別CV情報係数
# Average daily IC per configuration and tree-count checkpoint.
group_cols = ['t','train_length', 'test_length',
              'bagging_fraction', 'feature_fraction', 'min_data_in_leaf']
daily_ic_avg = daily_ic.groupby(group_cols + ['rounds']).daily_ic.mean().to_frame('ic').reset_index()
# Top-3 configurations per horizon.
daily_ic_avg.groupby('t', group_keys=False).apply(lambda x: x.nlargest(3, 'ic'))
# show_counts replaces the deprecated null_counts argument (removed in pandas 2.0).
daily_ic_avg.info(show_counts=True)
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4320 entries, 0 to 4319
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 t 4320 non-null int64
1 train_length 4320 non-null int64
2 test_length 4320 non-null int64
3 bagging_fraction 4320 non-null float64
4 feature_fraction 4320 non-null float64
5 min_data_in_leaf 4320 non-null int64
6 rounds 4320 non-null int64
7 ic 4320 non-null float64
dtypes: float64(3), int64(5)
memory usage: 270.1 KB
'''
日次では75%が正の日次ICを出していた。21日でも同じ傾向が見られた。
# Distribution of average daily IC per horizon; dashed line marks zero.
ax = sns.boxenplot(x='t', y='ic', data=daily_ic_avg)
ax.axhline(0, ls='--', lw=1, c='k');
訓練期間別
# Daily-IC distributions per horizon for 21-day test windows, faceted by
# training-window length.
g = sns.catplot(x='t',
                y='ic',
                col='train_length',
                row='test_length',
                data=daily_ic_avg[(daily_ic_avg.test_length == 21)],
                kind='boxen')
g.savefig(results_path / 'daily_ic_test_21', dpi=300);
# Regress average daily IC on dummy-encoded configuration settings to
# quantify each setting's marginal effect, for the 1- and 5-day horizons.
lin_reg = {}
for t in [1, 5]:
    df_ = daily_ic_avg[(daily_ic_avg.t==t)&(daily_ic_avg.rounds<=250)].dropna()
    y, X = df_.ic, df_.drop(['ic', 't'], axis=1)
    # drop_first avoids the dummy trap; the constant absorbs the baselines.
    X = sm.add_constant(pd.get_dummies(X, columns=X.columns, drop_first=True))
    model = sm.OLS(endog=y, exog=X)
    lin_reg[t] = model.fit()
    s = lin_reg[t].summary()
    # Parse the coefficient table out of the text summary and persist it.
    coefs = pd.read_csv(StringIO(s.tables[1].as_csv())).rename(
        columns=lambda x: x.strip())
    coefs.columns = ['variable', 'coef', 'std_err',
                     't', 'p_value', 'ci_low', 'ci_high']
    coefs.to_csv(results_path / f'lr_result_{t:02}.csv', index=False)
def visualize_lr_result(model, ax):
    """Plot OLS coefficients with confidence-interval error bars on `ax`.

    model: fitted statsmodels regression results (provides params, conf_int).
    ax: matplotlib Axes to draw on.
    """
    # Symmetric error bar = half the width of the confidence interval.
    ci = model.conf_int()
    errors = ci[1].sub(ci[0]).div(2)
    coefs = (model.params.to_frame('coef').assign(error=errors)
             .reset_index().rename(columns={'index': 'variable'}))
    # Drop the intercept and any date dummies.
    coefs = coefs[~coefs['variable'].str.startswith(
        'date') & (coefs.variable != 'const')]
    # Keep only the category value for the tick label,
    # e.g. 'bagging_fraction_0.75' -> '0.75'.
    coefs.variable = coefs.variable.str.split('_').str[-1]
    coefs.plot(x='variable', y='coef', kind='bar', ax=ax,
               color='none', capsize=3, yerr='error', legend=False, rot=0)
    ax.set_ylabel('IC')
    ax.set_xlabel('')
    # Fix: pd.np was deprecated in pandas 0.25 and removed in 2.0;
    # use numpy (imported at the top of the file) directly.
    ax.scatter(x=np.arange(len(coefs)), marker='_', s=120, y=coefs['coef'], color='black')
    ax.axhline(y=0, linestyle='--', color='black', linewidth=1)
    ax.xaxis.set_ticks_position('none')
    # Bracket annotations grouping the x-axis ticks by parameter family.
    ax.annotate('Train\nLength', xy=(.09, -0.1), xytext=(.09, -0.2),
                xycoords='axes fraction',
                textcoords='axes fraction',
                fontsize=11, ha='center', va='bottom',
                bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=5, lengthB=0.8', lw=1.0, color='black'))
    ax.annotate('Test\nLength', xy=(.23, -0.1), xytext=(.23, -0.2),
                xycoords='axes fraction',
                textcoords='axes fraction',
                fontsize=11, ha='center', va='bottom',
                bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=2, lengthB=0.8', lw=1.0, color='black'))
    ax.annotate('Bagging\nFraction', xy=(.32, -0.1), xytext=(.32, -0.2),
                xycoords='axes fraction',
                textcoords='axes fraction',
                fontsize=11, ha='center', va='bottom',
                bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=2.7, lengthB=0.8', lw=1.0, color='black'))
    ax.annotate('Feature\nFraction', xy=(.44, -0.1), xytext=(.44, -0.2),
                xycoords='axes fraction',
                textcoords='axes fraction',
                fontsize=11, ha='center', va='bottom',
                bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=3.4, lengthB=1.0', lw=1.0, color='black'))
    ax.annotate('Min.\nSamples', xy=(.55, -0.1), xytext=(.55, -0.2),
                xycoords='axes fraction',
                textcoords='axes fraction',
                fontsize=11, ha='center', va='bottom',
                bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=2.5, lengthB=1.0', lw=1.0, color='black'))
    ax.annotate('Number of\nRounds', xy=(.8, -0.1), xytext=(.8, -0.2),
                xycoords='axes fraction',
                textcoords='axes fraction',
                fontsize=11, ha='center', va='bottom',
                bbox=dict(boxstyle='square', fc='white', ec='black'),
                arrowprops=dict(arrowstyle='-[, widthB=11.2, lengthB=1.0', lw=1.0, color='black'))
以下のプロットは、回帰係数値とその信頼区間を示しています。切片(図示していない)は小さな正の値を持ち、統計的に有意です。ドロップされたカテゴリーの影響(各パラメーターの最小値)を取り込みます。
1日先の予測では、一部の結果(すべてではありません)から有用な示唆が得られます。テスト期間は21日間の方が適切で、min_data_in_leafは500または1,000が良好です。ツリー本数は100〜200本が最適に見え、訓練期間については、短い場合と長い場合の両方が中間の長さよりも優れています。
# Side-by-side coefficient plots for the 1- and 5-day horizons.
with sns.axes_style('white'):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
    axes = axes.flatten()
    for i, t in enumerate([1, 5]):
        visualize_lr_result(lin_reg[t], axes[i])
        axes[i].set_title(f'Lookahead: {t} Day(s)')
    fig.suptitle('OLS Coefficients & Confidence Intervals', fontsize=20)
    fig.tight_layout()
    fig.subplots_adjust(top=.92)
情報係数:まとめ
全体(プール)のIC値も確認します。全体ICはしばしば報告される指標ですが、モデルの日次リターン予測と日次ICを用いる日次取引戦略の目標とは、必ずしも一致しません。
# Pooled IC by horizon (excluding the 21-day horizon) for 21-day test
# windows, faceted by training-window length.
g = sns.catplot(x='t',
                y='ic',
                col='train_length',
                row='test_length',
                data=ic[(ic.test_length == 21) & (ic.t < 21)],
                kind='box')
# Drill into one configuration: IC vs number of trees (rounds).
t = 1
train_length = 756
test_length = 21
g = sns.catplot(x='rounds',
                y='ic',
                col='feature_fraction',
                hue='bagging_fraction',
                row='min_data_in_leaf',
                data=ic[(ic.t == t) &
                        (ic.train_length == train_length) &
                        (ic.test_length == test_length)],
                kind='swarm');
ランダムフォレスト vs 線形回帰
線形モデルデータ読み込み
# Reload the linear baseline for comparison. Fix: the original referenced an
# undefined `model_path` (NameError); the file was written to results_path
# earlier in this notebook.
lr_metrics = pd.read_csv(results_path / 'lin_reg_performance.csv')
# Best IC per horizon: random forest vs linear regression.
with sns.axes_style("white"):
    ax = (ic.groupby('t').ic.max().to_frame('Random Forest')
          .join(lr_metrics.groupby('lookahead').ic.max().to_frame('Linear Regression')).plot.barh())
    ax.set_ylabel('Lookahead')
    ax.set_xlabel('Information Coefficient')
    sns.despine()
    plt.tight_layout();
予測を生成
10個のベストモデルを使用して、アンサンブルモデルを作る。
# Hyperparameter columns stored per tuning result, in lookup order.
param_cols = ['train_length', 'test_length', 'bagging_fraction',
              'feature_fraction', 'min_data_in_leaf', 'rounds']

def get_params(data, t=5, best=0):
    """Return the hyperparameters of the `best`-ranked (by IC) model for
    horizon `t` as (numeric params Series, number of boosting rounds)."""
    ranked = data[data.t == t].sort_values('ic', ascending=False)
    row = ranked.iloc[best].loc[param_cols]
    n_rounds = int(row.rounds)
    return pd.to_numeric(row.drop('rounds')), n_rounds
# Random-forest settings reused when refitting the best models below.
base_params = dict(boosting_type='rf',
                   objective='regression',
                   bagging_freq=1,
                   verbose=-1)
# Refit the 10 best configurations per horizon on walk-forward splits and
# store their test-set predictions (columns 0..9) for ensembling.
store = Path(results_path / 'predictions.h5')
for lookahead in [1, 5, 10, 21]:
    if lookahead > 1: continue  # only the 1-day horizon is run here
    print(f'\nLookahead: {lookahead:02}')
    data = (pd.read_hdf('data.h5', 'stooq/japan/equities'))
    labels = sorted(data.filter(like='fwd').columns)
    features = data.columns.difference(labels).tolist()
    label = f'fwd_ret_{lookahead:02}'
    data = data.loc[:, features + [label]].dropna()
    # Re-encode calendar features as integer codes for LightGBM.
    categoricals = ['year', 'weekday', 'month']
    for feature in categoricals:
        data[feature] = pd.factorize(data[feature], sort=True)[0]
    lgb_data = lgb.Dataset(data=data[features],
                           label=data[label],
                           categorical_feature=categoricals,
                           free_raw_data=False)
    for position in range(10):
        # position = rank of the configuration by average daily IC.
        params, num_boost_round = get_params(daily_ic_avg,
                                             t=lookahead,
                                             best=position)
        params = params.to_dict()
        params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
        # Window lengths configure the CV, not the booster.
        train_length = int(params.pop('train_length'))
        test_length = int(params.pop('test_length'))
        params.update(base_params)
        print(f'\tPosition: {position:02}')
        n_splits = int(2 * YEAR / test_length)
        cv = MultipleTimeSeriesCV(n_splits=n_splits,
                                  test_period_length=test_length,
                                  lookahead=lookahead,
                                  train_period_length=train_length)
        predictions = []
        start = time()
        for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
            lgb_train = lgb_data.subset(train_idx.tolist()).construct()
            # NOTE(review): verbose_eval was removed in LightGBM 4.0;
            # on current versions this call needs log callbacks instead.
            model = lgb.train(params=params,
                              train_set=lgb_train,
                              num_boost_round=num_boost_round,
                              verbose_eval=False)
            test_set = data.iloc[test_idx, :]
            y_test = test_set.loc[:, label].to_frame('y_test')
            y_pred = model.predict(test_set.loc[:, model.feature_name()])
            predictions.append(y_test.assign(prediction=y_pred))
        # Accumulate one prediction column per model rank.
        if position == 0:
            test_predictions = (pd.concat(predictions)
                                .rename(columns={'prediction': position}))
        else:
            test_predictions[position] = pd.concat(predictions).prediction
    # Daily rank IC for each of the ten models.
    by_day = test_predictions.groupby(level='date')
    for position in range(10):
        if position == 0:
            ic_by_day = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0]).to_frame()
        else:
            ic_by_day[position] = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0])
    test_predictions.to_hdf(store, f'test/{lookahead:02}')
この記事が気に入ったらサポートをしてみませんか?