くもすけさんのbacktestをpandasからできるだけnumpyで速くしてみた
元記事はこちら
ドテン&ピラミッディングのコードの一部をnumpyで計算して検証してみました
# Position calculation (stop-and-reverse & pyramiding)
pyramiding = 3  # cap on the absolute position size

# Per-row order: +1 where 'long' is set, -1 where 'short' is set, else 0.
# 'short' is applied last, so it wins when both flags are set on a row.
df['order'] = 0
df['order'] = df['order'].where(df['long'] != True, 1)
df['order'] = df['order'].where(df['short'] != True, -1)

# Current direction: carry the last non-zero order forward; rows before the
# first order get 0.  `.ffill()` replaces the deprecated
# `fillna(method='ffill')`.
df['pos'] = df['order'].where(df['order'] != 0).ffill().fillna(0)

# A negative product of consecutive directions marks a reversal.  The running
# count of reversals labels each same-direction run, and the orders are
# accumulated within each run.
df['pos'] = df.groupby((df['pos'] * df['pos'].shift(1) < 0).cumsum())['order'].cumsum()

# Clamp the position to [-pyramiding, +pyramiding].
df['pos'] = df['pos'].where(df['pos'] <= pyramiding, pyramiding)
df['pos'] = df['pos'].where(df['pos'] >= -pyramiding, -pyramiding)
print(df)
numpyにはffill()とshift()がないのでこれらの関数を追加
def np_shift(arr, num=1, fill_value=np.nan):
    """Shift ``arr`` along its first axis by ``num`` positions.

    A positive ``num`` moves values towards the end, a negative ``num``
    towards the start; ``num == 0`` returns a plain copy.  Slots vacated by
    the shift are set to ``fill_value``.

    Args:
        arr: Array-like input.
        num: Number of positions to shift (may be negative or zero).
        fill_value: Value written into the vacated slots.

    Returns:
        A new array with the same shape as ``arr``.  The dtype is that of
        ``arr``, except that integer/boolean input is promoted to a float
        dtype when ``fill_value`` is NaN, because NaN cannot be stored in
        those dtypes.
    """
    arr = np.asarray(arr)
    dtype = arr.dtype
    fill_is_nan = isinstance(fill_value, (float, np.floating)) and np.isnan(fill_value)
    if fill_is_nan and dtype.kind in "biu":
        # Without promotion, writing NaN raises for integer buffers and is
        # silently stored as True for boolean ones.
        dtype = np.result_type(dtype, np.float64)

    result = np.empty(arr.shape, dtype=dtype)
    if num > 0:
        result[:num] = fill_value
        result[num:] = arr[:-num]
    elif num < 0:
        result[num:] = fill_value
        result[:num] = arr[-num:]
    else:
        result[:] = arr
    return result
def np_ffill(arr, axis=0):
    """Forward-fill NaN entries of ``arr`` along ``axis``.

    Each NaN is replaced by the most recent non-NaN value that precedes it
    along ``axis``.  When no non-NaN value precedes it, the element at
    position 0 of that lane is used, so a leading NaN stays NaN.

    Args:
        arr: Array-like numeric input.
        axis: Axis to fill along; negative values count from the last axis.

    Returns:
        A new array with the same shape and dtype as ``arr``.
    """
    arr = np.asarray(arr)
    if axis < 0:
        # Normalise so the broadcast shape below is built for the right axis.
        axis += arr.ndim

    # Positions 0..n-1 along `axis`, shaped to broadcast against `arr`.
    pos_shape = [1] * arr.ndim
    pos_shape[axis] = -1
    positions = np.arange(arr.shape[axis]).reshape(pos_shape)

    # Valid entries keep their own position, NaN entries get 0; a running
    # maximum then yields, for every entry, the position of the latest valid
    # value seen so far.
    idx = np.where(~np.isnan(arr), positions, 0)
    np.maximum.accumulate(idx, axis=axis, out=idx)
    return np.take_along_axis(arr, idx, axis=axis)
要素数108000のデータを用意
# Number of rows in the DataFrame used for the timings below.
len(df)
# 108000
pandasとnumpyでの時間比較
pandas
%%timeit
# Baseline: the pandas-only position calculation, timed as a single cell.
# NOTE(review): fillna(method='ffill') is deprecated in pandas >= 2.1; the
# code is left untouched because the timing recorded below belongs to it.
# Per-row order: +1 for 'long', -1 for 'short' (applied last), else 0.
df['order'] = 0
df['order'] = df['order'].where(df['long']!=True,1)
df['order'] = df['order'].where(df['short']!=True,-1)
# Carry the last non-zero order forward; rows before the first order get 0.
df['pos'] = df['order'].where(df['order']!=0,).fillna(method='ffill').fillna(0)
# Label runs between sign changes and accumulate the orders inside each run.
df['pos'] = df.groupby((df['pos']*df['pos'].shift(1)<0).cumsum().fillna(0))['order'].cumsum()
pyramiding = 3
# Clamp the position to [-pyramiding, +pyramiding].
df['pos'] = df['pos'].where(df['pos']<=pyramiding,pyramiding )
df['pos'] = df['pos'].where(df['pos']>=-pyramiding,-pyramiding )
# 31.7 ms ± 383 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
numpy
%%timeit
# NumPy variant of the same calculation; only the group-wise cumulative sum
# still goes through pandas groupby.
# Per-row order: -1 for 'short' (checked first), +1 for 'long', else 0.
df['order'] = 0
df['order'] = np.where(df['short'], -1, np.where(df['long'], 1, df['order']))
# Zero orders become NaN, are forward-filled, and any NaN left is set to 0.
df['pos'] = np.nan_to_num(np_ffill(np.where(df['order']==0, np.nan, df['order'])))
# Group key: running count of rows where the direction product is negative.
df['pos'] = df.groupby(np.nan_to_num(np_ffill(np.cumsum(df['pos'].to_numpy()*np_shift(df['pos'].to_numpy())<0))))['order'].cumsum()
pyramiding = 3
# Clamp the position to [-pyramiding, +pyramiding].
df['pos'] = np.where(df['pos']<=-pyramiding,-pyramiding,np.where(df['pos']>=pyramiding,pyramiding,df['pos']))
# 24.4 ms ± 819 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
データに違いがないかチェック
def pd_arr(df):
    """Compute the 'order' and 'pos' columns with pandas operations only.

    Args:
        df: DataFrame with boolean-like 'long' and 'short' columns.

    Returns:
        The same ``df`` object.  It is modified in place: 'order' and 'pos'
        are added or overwritten.
    """
    pyramiding = 3  # cap on the absolute position size

    # Per-row order: +1 for 'long', -1 for 'short' (applied last, so it wins
    # when both flags are set), else 0.
    df['order'] = 0
    df['order'] = df['order'].where(df['long'] != True, 1)
    df['order'] = df['order'].where(df['short'] != True, -1)

    # Current direction: carry the last non-zero order forward; rows before
    # the first order get 0.  `.ffill()` replaces the deprecated
    # `fillna(method='ffill')`.
    direction = df['order'].where(df['order'] != 0).ffill().fillna(0)

    # A negative product of consecutive directions marks a reversal; the
    # running count of reversals labels each same-direction run.
    run_id = (direction * direction.shift(1) < 0).cumsum()
    df['pos'] = df.groupby(run_id)['order'].cumsum()

    # Clamp the position to [-pyramiding, +pyramiding].
    df['pos'] = df['pos'].where(df['pos'] <= pyramiding, pyramiding)
    df['pos'] = df['pos'].where(df['pos'] >= -pyramiding, -pyramiding)
    return df
def np_arr(df):
    """Compute the 'order' and 'pos' columns, using NumPy where possible.

    Relies on the ``np_shift`` and ``np_ffill`` helpers from this file.

    Args:
        df: DataFrame with boolean-like 'long' and 'short' columns.

    Returns:
        The same ``df`` object.  It is modified in place: 'order' and 'pos'
        are added or overwritten.
    """
    pyramiding = 3  # cap on the absolute position size

    # Per-row order: -1 for 'short' (checked first, so it wins when both
    # flags are set), +1 for 'long', else 0.
    df['order'] = 0
    df['order'] = np.where(df['short'], -1, np.where(df['long'], 1, df['order']))
    order = df['order'].to_numpy()

    # Current direction: zero orders become NaN, are forward-filled, and any
    # NaN left at the start is turned into 0.
    direction = np.nan_to_num(np_ffill(np.where(order == 0, np.nan, order)))

    # A negative product of consecutive directions marks a reversal.  The
    # cumulative sum of that boolean mask is an integer array, so it needs no
    # forward fill or NaN replacement before being used as the group key.
    run_id = np.cumsum(direction * np_shift(direction) < 0)
    df['pos'] = df.groupby(run_id)['order'].cumsum()

    # Clamp the position to [-pyramiding, +pyramiding].
    df['pos'] = np.clip(df['pos'].to_numpy(), -pyramiding, pyramiding)
    return df
# Give each implementation its own copy of the frame.  pd_arr and np_arr both
# write into the frame they receive and return that same object, so passing
# the same `df` to both would compare the frame with itself and yield True
# regardless of what either function computed.
np.isclose(pd_arr(df.copy()), np_arr(df.copy())).all()
# True
31.7 ms ± 383 µs ---> 24.4 ms ± 819 µs
思ってたより速くならない結果となりました
groupbyの部分が一番時間掛かってたのでその部分を頑張ってnumpy化すれば速くなりそうですが今日はここまで
7/11 numpyのdf['pos']の式が間違っていたところを修正
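pandasの`where(cond, other)`は、条件がFalseの要素を第2引数`other`の値で置き換える(Trueの要素は元の値のまま)。
numpyの`np.where(cond, x, y)`は、条件がTrueの要素には第2引数`x`を、Falseの要素には第3引数`y`を返す。
numpyで値を変えずにそのまま通したい(passしたい)側の引数には、元のデータを渡す。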
# before
# Wrong: every element with pos >= -pyramiding (not only those below the
# lower bound) is replaced by -pyramiding.
df['pos'] = np.where(df['pos']>=-pyramiding,-pyramiding, np.where(df['pos']<=pyramiding,pyramiding,df['pos']))
# after
# Values at or below -pyramiding become -pyramiding, values at or above
# +pyramiding become +pyramiding, everything else is passed through.
df['pos'] = np.where(df['pos']<=-pyramiding,-pyramiding,np.where(df['pos']>=pyramiding,pyramiding,df['pos']))
------------------------------------- 番外編 -------------------------------------
numpy-groupies & numbaを使った場合
pip installで簡単にインストールできます。
numpyのコードを以下のように変更
# numpy
# The step being replaced: cumulative sum of 'order' within each group,
# computed through pandas groupby.
df['pos'] = df.groupby(np.nan_to_num(np_ffill(np.cumsum(df['pos'].to_numpy()*np_shift(df['pos'].to_numpy())<0))))['order'].cumsum()
# numpy-groupies & numba
import numpy_groupies as npg
# Same group key as above, kept as a plain array instead of being handed to
# groupby.
group_index = np.nan_to_num(np_ffill(np.cumsum(df['pos'].to_numpy()*np_shift(df['pos'].to_numpy())<0)))
# Arguments are (group key array, values array, func='cumsum'), which per
# this article gives the cumulative sum of the values within each group.
df['pos'] = npg.aggregate(group_index, df['order'].to_numpy(), func='cumsum')
%%timeit
# NumPy variant with the pandas groupby step replaced by npg.aggregate.
# Per-row order: -1 for 'short' (checked first), +1 for 'long', else 0.
df['order'] = 0
df['order'] = np.where(df['short'], -1, np.where(df['long'], 1, df['order']))
# Zero orders become NaN, are forward-filled, and any NaN left is set to 0.
df['pos'] = np.nan_to_num(np_ffill(np.where(df['order']==0, np.nan, df['order'])))
# Group key: running count of rows where the direction product is negative.
group_index = np.nan_to_num(np_ffill(np.cumsum(df['pos'].to_numpy()*np_shift(df['pos'].to_numpy())<0)))
# Cumulative sum of 'order' within each group.
df['pos'] = npg.aggregate(group_index, df['order'].to_numpy(), func='cumsum')
pyramiding = 3
# Clamp the position to [-pyramiding, +pyramiding].
df['pos'] = np.where(df['pos']<=-pyramiding,-pyramiding,np.where(df['pos']>=pyramiding,pyramiding,df['pos']))
# 14.6 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
pandas ---> numpy ---> numpy-groupies & numba
31.7 ms ± 383 µs ---> 24.4 ms ± 819 µs ---> 14.6 ms ± 150 µs
pandasに比べて半分以下で終わるようになりました
使い方も簡単で累積和が知りたい場合は
npg.aggregate(グループ化したい配列, 集計したい配列, func='cumsum')
とするだけです。