# Asset Price Prediction

##### Financial Prediction Model (FPM)

The FPM has applications in predicting volatile time-series information. The features can be of any kind as long as they are numerically represented. In many time-series models it is essential to create hand-engineered features; in this scenario, lagging certain features leads to much better results than not doing so.

#### Finance

In finance, a model similar to the one created here can be used to predict the future movement of asset prices. As expected, an extra-trees regressor (or a random forest, if less constrained) combined with linear regressions on important variables led to low cross-validated errors.

• Asset pricing prediction and modelling.
• Economic forecasts and decision making.
• Modelling a time series of a firm's or individual's operating risk.

Similar to the finance applications, there are thousands of opportunities to improve your internal business management.

• Tracking and predicting time series of operational data, such as data-room temperature, wind-turbine speed, etc., to increase performance and efficiency.
• Tracking and predicting asset values and similarly tracking inventory levels.

# Top 10% Financial Modeling Challenge
# https://www.kaggle.com/sonder/two-sigma-financial-modeling/d-play/run/946793

import kagglegym
import numpy as np
import pandas as pd
import random
from sklearn import ensemble, linear_model, metrics

# Create the kagglegym environment and take the first observation; its
# .train attribute is the training DataFrame used throughout this script.
env = kagglegym.make()
o = env.reset()
train = o.train
print(train.shape)
# NOTE: despite the name, d_mean holds the per-column *median*. It is the fill
# value applied to missing data both while training and while predicting.
d_mean= train.median(axis=0)
# Number of missing values in each row, kept as an extra feature.
train["nbnulls"]=train.isnull().sum(axis=1)
# Feature columns available at this point (identifiers and target excluded).
col=[x for x in train.columns if x not in ['id', 'timestamp', 'y']]

# Seed passed as random_state to the feature expander and the tree ensembles.
rnd=17

# Columns for which a "<name>_na" missing-value indicator is added
# (chosen because the tree algorithms ranked them best).
nas_cols=['technical_9', 'technical_0', 'technical_32', 'technical_16', 'technical_38',
'technical_44', 'technical_20', 'technical_30', 'technical_13']
# Columns for which a "<name>_d" change-since-previous-timestamp feature is added
# (chosen because the tree algorithms ranked them best).
diff_cols=['technical_22','technical_20', 'technical_30', 'technical_13', 'technical_34']

#homemade class used to infer randomly on the way the model learns
class createLinearFeatures:
    """Group columns into small linear models and expose their predictions as features.

    ``fit`` shuffles the columns, seeds up to ``n_neighbours`` groups with one
    column each, then offers every remaining column to all groups; the column
    joins the group whose in-sample mean squared error it lowers the most (if
    it lowers any). ``transform`` adds one ``neighbour<i>`` column per group
    holding that group's intercept-free linear-regression prediction.

    Parameters
    ----------
    n_neighbours : int
        Number of groups (and therefore of generated features).
    max_elts : int or None
        Maximum number of columns per group; ``None`` means "no limit" and is
        replaced by the number of training columns on the first ``fit``.
    verbose : bool
        When true, ``fit`` prints the index, score and columns of every group.
    random_state : int or None
        Seed given to ``random.seed`` before shuffling the columns.
    """

    def __init__(self, n_neighbours=1, max_elts=None, verbose=True, random_state=None):
        self.rnd = random_state
        self.n = n_neighbours
        self.max_elts = max_elts
        self.verbose = verbose
        self.neighbours = []
        self.clfs = []

    @staticmethod
    def _new_regressor():
        """Return a fresh intercept-free linear regression."""
        # `normalize=True` is intentionally not passed: scikit-learn ignores it
        # when fit_intercept=False and removed the argument in version 1.2.
        return linear_model.LinearRegression(fit_intercept=False, copy_X=True, n_jobs=-1)

    def fit(self, train, y):
        """Build the column groups on ``train``/``y`` and fit one regression per group.

        Returns ``self`` so calls can be chained.
        """
        if self.rnd is not None:
            random.seed(self.rnd)
        if self.max_elts is None:
            self.max_elts = len(train.columns)

        # Start from a clean slate so that calling fit() twice does not stack
        # the new groups and models on top of the previous ones.
        self.clfs = []

        list_vars = list(train.columns)
        random.shuffle(list_vars)

        # The first n shuffled columns seed the groups; the rest are candidates.
        self.neighbours = [[elt] for elt in list_vars[:self.n]]
        candidates = list_vars[self.n:]

        # Sized from the groups actually created (there may be fewer than n
        # columns). The huge initial score guarantees that the first candidate
        # tried on a group always counts as an improvement.
        lastscores = np.zeros(len(self.neighbours)) + 1e15

        for elt in candidates:
            scores = []
            for idx, group in enumerate(self.neighbours):
                if len(group) < self.max_elts:
                    trial_cols = group + [elt]
                    clf = self._new_regressor()
                    clf.fit(train[trial_cols], y)
                    scores.append(metrics.mean_squared_error(y, clf.predict(train[trial_cols])))
                else:
                    # Full group: repeat its current score so its gain is zero.
                    scores.append(lastscores[idx])
            gains = lastscores - np.asarray(scores)
            if gains.size and gains.max() > 0:
                winner = gains.argmax()
                lastscores[winner] = scores[winner]
                self.neighbours[winner].append(elt)

        # Final fit of one model per group, in group order.
        for idx, group in enumerate(self.neighbours):
            clf = self._new_regressor()
            clf.fit(train[group], y)
            self.clfs.append(clf)
            if self.verbose:
                print(idx, lastscores[idx], group)
        return self

    def transform(self, train):
        """Add the ``neighbour<i>`` prediction columns to ``train`` and return it.

        ``train`` is modified in place (no copy is taken, to limit memory use);
        pandas may emit a SettingWithCopyWarning when it is a slice.
        """
        for idx, (group, clf) in enumerate(zip(self.neighbours, self.clfs)):
            train['neighbour' + str(idx)] = clf.predict(train[group])
        return train

    def fit_transform(self, train, y):
        """Fit on ``train``/``y`` and return ``train`` with the new columns added."""
        self.fit(train, y)
        return self.transform(train)

#a home-made class attempt to remove outliers by successive quantization on residuals
class recurrent_linear_approx():
    """Intercept-free ridge regression refitted while trimming the largest residuals.

    Parameters
    ----------
    quant : float
        Quantile of the absolute residuals kept after each round; rows whose
        absolute residual exceeds it are dropped before the next fit.
    limit_size_train : float
        Fraction of ``len(train)``; trimming stops once fewer rows than this
        would remain.
    """

    def __init__(self, quant=.999, limit_size_train=.9):
        self.quant = quant
        self.limit_size_train = limit_size_train
        self.bestmodel = []

    def fit(self, train, y):
        """Fit on the NaN-free rows of ``train``, trimming outliers between rounds.

        Rounds continue while the in-sample mean squared error keeps improving
        and enough rows remain. Returns ``self``.

        NOTE: a single Ridge instance is refitted in place every round, so
        ``bestmodel`` always ends up being the *last* fit performed, including
        the round whose score stopped improving. This mirrors the original
        behaviour.
        """
        internal_model = linear_model.Ridge(fit_intercept=False)
        bestscore = 1e15
        # Only rows without missing values are ever used for fitting.
        indextrain = train.dropna().index
        limitlen = len(train) * self.limit_size_train
        while True:
            # .loc replaces the label-based .ix accessor removed in pandas 1.0.
            x_round = train.loc[indextrain]
            y_round = y.loc[indextrain]
            internal_model.fit(x_round, y_round)
            fitted = internal_model.predict(x_round)
            score = metrics.mean_squared_error(y_round, fitted)
            if score >= bestscore:
                break
            bestscore = score
            residual = y_round - fitted
            abs_residual = abs(residual)
            # Keep the rows whose residual is within the requested quantile.
            indextrain = residual[abs_residual <= abs_residual.quantile(self.quant)].index
            if len(indextrain) < limitlen:
                break
        self.bestmodel = internal_model
        return self

    def predict(self, test):
        """Return the predictions of the stored model for ``test``."""
        return self.bestmodel.predict(test)

# Add a 0/1 "<column>_na" indicator for every column listed in nas_cols.
# The loop walks a snapshot of the list: nas_cols is pruned inside the loop,
# and removing from a list while iterating over it would silently skip the
# element that follows each removal.
for elt in list(nas_cols):
    train[elt + '_na'] = train[elt].isnull().astype(int)
    # no need to keep columns with no information
    if train[elt + '_na'].nunique() == 1:
        print("removed:", elt, '_na')
        del train[elt + '_na']
        nas_cols.remove(elt)

train = train.sort_values(by=['id', 'timestamp'])
for elt in diff_cols:
    # Change since the previous row. This is a quick approximation: on the
    # first row of each id the delta is taken against the previous id's last
    # row. diff() replaces rolling(2).apply(lambda x: x[1] - x[0]), which
    # relied on the window being a positional array.
    train[elt + "_d"] = train[elt].diff().fillna(0)
# removing month 0 to reduce the impact of erroneous deltas
train = train[train.timestamp != 0]

print(train.shape)
cols = [x for x in train.columns if x not in ['id', 'timestamp', 'y']]

# Generation of single-column linear models. Three parallel lists are kept:
# models[i] is used with columns[i], and residuals[i] is its absolute error on
# every training row.
cols2fit = ['technical_22', 'technical_20', 'technical_30_d', 'technical_20_d', 'technical_30',
            'technical_13', 'technical_34']
models = []
columns = []
residuals = []
for elt in cols2fit:
    print("fitting linear model on ", elt)
    model = recurrent_linear_approx(quant=.99, limit_size_train=.9)
    # Fitted on the raw column (rows with NaN are dropped inside fit) ...
    model.fit(train.loc[:, [elt]], train.loc[:, 'y'])
    models.append(model)
    columns.append([elt])
    # ... but scored on every row, with missing values filled from d_mean.
    residuals.append(abs(model.predict(train[[elt]].fillna(d_mean)) - train.y))

train = train.fillna(d_mean)

# Linear-combination features, learnt only on rows with a moderate target.
featureexpander = createLinearFeatures(n_neighbours=30, max_elts=2, verbose=True, random_state=rnd)
index2use = train[abs(train.y) < 0.07].index
# .loc replaces the label-based .ix accessor removed in pandas 1.0.
featureexpander.fit(train.loc[index2use, cols], train.loc[index2use, 'y'])
trainer = featureexpander.transform(train[cols])
treecols = trainer.columns

print("training trees")
model = ensemble.ExtraTreesRegressor(n_estimators=29, max_depth=4, n_jobs=-1, random_state=rnd, verbose=0)
model.fit(trainer, train.y)
print(pd.DataFrame(model.feature_importances_, index=treecols).sort_values(by=[0]).tail(30))
# Every individual tree of the ensemble becomes a candidate model too.
for elt in model.estimators_:
    models.append(elt)
    columns.append(treecols)
    residuals.append(abs(elt.predict(trainer) - train.y))

# Model selection: for every training row find the candidate model with the
# lowest absolute residual, then keep only the few models that win most often.
num_to_keep = 10
targetselector = np.array(residuals).T
targetselector = np.argmin(targetselector, axis=1)
print("selecting best models:")
# `tokeep` was referenced below without ever being defined (NameError).
# NOTE(review): the rule used here - the num_to_keep models that are most
# frequently the per-row best - is a reconstruction; confirm it matches the
# intended selection.
win_counts = pd.Series(targetselector).value_counts().head(num_to_keep)
print(win_counts)
tokeep = win_counts.index

tokeepmodels = []
tokeepcolumns = []
tokeepresiduals = []
for elt in tokeep:
    tokeepmodels.append(models[elt])
    tokeepcolumns.append(columns[elt])
    tokeepresiduals.append(residuals[elt])

# New target for the selector: index (within the kept models) of the model
# with the lowest absolute residual on each row.
targetselector = np.array(tokeepresiduals).T
targetselector = np.argmin(targetselector, axis=1)

print("training selection model")
modelselector = ensemble.ExtraTreesClassifier(n_estimators=30, max_depth=4, n_jobs=-1, random_state=rnd, verbose=0)
modelselector.fit(trainer, targetselector)
print(pd.DataFrame(modelselector.feature_importances_, index=treecols).sort_values(by=[0]).tail(30))

# Values of the delta columns at the last training timestamp; they seed the
# deltas of the first prediction step. The last timestamp is read from the
# data instead of being hard-coded as 905.
lastvalues = train[train.timestamp == train.timestamp.max()][['id'] + diff_cols].copy()

print("end of trainind, now predicting")
indice = 0
countplus = 0
rewards = []
while True:
    indice += 1
    # Same feature preparation as on the training set.
    test = o.features
    test["nbnulls"] = test.isnull().sum(axis=1)
    for elt in nas_cols:
        test[elt + '_na'] = test[elt].isnull().astype(int)
    test = test.fillna(d_mean)

    pred = o.target

    # Deltas against the values stored at the previous step, for the ids that
    # are present in both. Rows are aligned on id rather than by position, so
    # the result does not depend on both frames listing ids in the same order.
    # NOTE(review): assumes each id appears at most once per step - confirm.
    previous = lastvalues.set_index('id')[diff_cols]
    current = test.set_index('id')[diff_cols]
    common_ids = current.index.intersection(previous.index)
    deltas = (current.loc[common_ids] - previous.loc[common_ids]).rename_axis('id').reset_index()
    # Ids without a previous value get a delta of 0 through fillna.
    test = test.merge(right=deltas, how='left', on='id', suffixes=('', '_d')).fillna(0)
    # storing new lastvalues
    lastvalues = test[['id'] + diff_cols].copy()

    test = featureexpander.transform(test[cols])
    # Prediction: each kept model's output is weighted by the probability the
    # selector assigns to that model, and accumulated onto pred['y'].
    selected_prediction = modelselector.predict_proba(test.loc[:, treecols])
    for ind, elt in enumerate(tokeepmodels):
        pred['y'] += selected_prediction[:, ind] * elt.predict(test[tokeepcolumns[ind]])

    o, reward, done, info = env.step(pred)
    rewards.append(reward)
    if reward > 0:
        countplus += 1

    if indice % 100 == 0:
        print(indice, countplus, reward, np.mean(rewards))

    if done:
        print(info["public_score"])
        break


The following two scripts differ from the one above in most respects and also scored highly.

# Rolling Regression Script: top 14%

import kagglegym
import numpy as np
import pandas as pd
from sklearn import linear_model as lm
from sklearn.ensemble import ExtraTreesRegressor

# The "environment" is our interface for code competitions
env = kagglegym.make()
# We get our initial observation by calling "reset"
o = env.reset()

# Feature columns: everything except the id, sample, target and time columns.
excl = [env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
col = [c for c in o.train.columns if c not in excl]

# The training frame must be taken from the observation *before* computing
# the fill values; the original read `train` one line before this script
# assigned it.
train = o.train[col]
# NOTE: despite the name, d_mean holds per-column medians.
d_mean = train.median(axis=0)

# Number of missing values per row, added as the 'znull' feature below.
n = train.isnull().sum(axis=1)
for c in col:
    # Boolean "was missing" indicator; its own fill value is 0.
    train[c + '_nan_'] = pd.isnull(train[c])
    d_mean[c + '_nan_'] = 0
train_1 = train.fillna(d_mean)
train_1['znull'] = n
n = []

rfr = ExtraTreesRegressor(n_estimators=30, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
model1 = rfr.fit(train_1, o.train['y'])

train = o.train
cols_to_use = ['technical_20', 'technical_30', 'technical_13', 'y']
excl = ['id', 'y', 'timestamp']
allcol = [c for c in train.columns if ((c in excl) | (c in cols_to_use))]
allcol1 = [c for c in allcol if not (c == 'y')]
train = train[allcol]

# Targets outside [low_y_cut, high_y_cut] are excluded from the fits below
# and predictions are clipped to this range.
low_y_cut = -0.075
high_y_cut = 0.075
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)

pred = np.array(train[cols_to_use])
tis = np.array(train.loc[:, 'timestamp'], dtype=int)
ids = np.array(train.loc[:, 'id'], dtype=int)
del train

# predtab[timestamp, id, k] holds column k of cols_to_use (the three
# technical features followed by y); cells never observed stay NaN.
predtab = np.full((tis.max() + 1, ids.max() + 1, pred.shape[1]), np.nan)
# One vectorised scatter instead of rescanning every row once per id.
predtab[tis, ids, :] = pred
del pred, tis, ids

# Alternating fit of two coefficient vectors, repeated twice:
#   dconst - weights of the three technical features,
#   gconst - weights of two consecutive timestamps (starting at [1, -1]).
# Only cells with complete features and a finite target strictly inside
# (low_y_cut, high_y_cut) are used.
gconst = [1, -1]
for _ in range(2):  # loop variable no longer shadows the builtin `iter`
    # Step 1: combine timestamps t and t+1 with gconst, regress y(t) on the
    # three combined features to get dconst.
    dt = gconst[0] * predtab[:-1, :, 0:3] + gconst[1] * predtab[1:, :, 0:3]
    trg = predtab[:-1, :, -1]
    ok = (np.sum(np.isnan(dt), axis=2) == 0) & np.isfinite(trg) & (trg < high_y_cut) & (trg > low_y_cut)
    met1 = lm.LinearRegression()
    # Boolean indexing on the first two axes yields the (n_ok, n_features)
    # matrix directly; same rows, same order as the repeat/reshape form.
    dt = dt[ok]
    met1.fit(dt, trg[ok])
    r2 = met1.score(dt, trg[ok])
    dconst = met1.coef_
    print('Dconst=', dconst, ' R=', np.sqrt(r2))

    # Step 2: project the features onto dconst, regress y(t) on the projection
    # at t and at t+1 to get gconst.
    dt = np.dot(predtab[:, :, 0:3], dconst).reshape((predtab.shape[0], predtab.shape[1], 1))
    dt = np.concatenate((dt[:-1, :, :], dt[1:, :, :]), axis=2)
    ok = (np.sum(np.isnan(dt), axis=2) == 0) & np.isfinite(trg) & (trg < high_y_cut) & (trg > low_y_cut)
    met1 = lm.LinearRegression()
    dt = dt[ok]
    met1.fit(dt, trg[ok])
    r2 = met1.score(dt, trg[ok])
    gconst = met1.coef_
    print('Gconst=', gconst, ' R=', np.sqrt(r2))
del dt, trg, ok

def expandmas2(mas, n):
    """Return ``mas`` widened so that column index ``n`` exists.

    When ``mas`` already has more than ``n`` columns it is returned unchanged.
    Otherwise a new float array with ``int(n * 1.2 + 1)`` columns is returned:
    the existing columns are copied over and every added column repeats the
    row's last existing value.
    """
    old_width = mas.shape[1]
    if old_width > n:
        return mas

    widened = np.empty((mas.shape[0], int(n * 1.2 + 1)))
    # Pre-fill each row with its last known value, then restore the originals.
    for target_row, source_row in zip(widened, mas):
        target_row[:] = source_row[-1]
    widened[:, :old_width] = mas
    return widened
def expandmas(mas, n, m):
    """Return 3-D ``mas`` grown so that indices ``n`` (axis 0) and ``m`` (axis 1) exist.

    Each axis is handled independently: when its length is not greater than the
    requested index, a new float array of length ``int(index * 1.2 + 1)`` along
    that axis is allocated, filled with NaN, and the existing data is copied
    into its leading corner. If neither axis needs growing, ``mas`` itself is
    returned.
    """
    rows, cols, depth = mas.shape[0], mas.shape[1], mas.shape[2]

    if rows <= n:
        taller = np.full((int(n * 1.2 + 1), cols, depth), np.nan)
        taller[:rows, :cols, :] = mas
        mas = taller
        rows = mas.shape[0]

    if cols <= m:
        wider = np.full((rows, int(m * 1.2 + 1), depth), np.nan)
        wider[:rows, :cols, :] = mas
        mas = wider

    return mas

# One rolling coefficient per timestamp, stored as a single-row matrix.
realhist = predtab.shape[0]
coef = np.zeros((1, realhist))


def trainrolling(tset):
    """Update the rolling coefficients for the timestamps in ``tset``.

    For each ``t`` the target column (last slice of the global ``predtab``) at
    timestamp ``s0 = max(t - 1, 1)`` is regressed, without intercept, on the
    same column at ``s0 - 1``, using only ids whose two values are finite and
    strictly inside ``(low_y_cut, high_y_cut)``. One extra pseudo-observation
    ``(alp1, alp1 * coef[0, t - 1])`` is appended so the new slope is pulled
    towards the previous coefficient. With no usable ids the previous
    coefficient is carried over.

    Reads the globals ``predtab``, ``low_y_cut`` and ``high_y_cut`` and writes
    into the global ``coef`` in place.

    Returns
    -------
    (res, coef)
        ``res`` is ``predtab[t - 1, :, -1] * coef[0, t]`` for the last
        ``t >= 1`` in ``tset``. If there is no such ``t`` (empty ``tset`` or
        only ``t < 1``) it is an all-NaN vector with one entry per id, instead
        of the UnboundLocalError the original raised.
    """
    res = np.full(predtab.shape[1], np.nan)
    for t in tset:
        s0 = max(t - 1, 1)
        y = predtab[s0, :, -1]
        x = predtab[s0 - 1, :, -1]
        ok = (np.isfinite(x) & np.isfinite(y)
              & (x > low_y_cut) & (x < high_y_cut)
              & (y < high_y_cut) & (y > low_y_cut))
        if np.sum(ok) == 0:
            coef[0, t] = coef[0, t - 1]
        else:
            x1 = x[ok]
            y1 = y[ok]
            # Weight of the pseudo-observation: spread of the data times the
            # sample size, with the sample size floored at 200.
            alp1 = np.std(np.concatenate((x1, y1))) * max(200, x1.shape[0])
            x1 = np.concatenate((x1, [alp1]))
            y1 = np.concatenate((y1, [alp1 * coef[0, t - 1]]))
            # Least-squares slope of a line through the origin.
            coef[0, t] = np.sum(x1 * y1) / np.sum(x1 * x1)
        if t >= 1:
            res = predtab[t - 1, :, -1] * coef[0, t]
    return res, coef

reward = 0
n = 0  # number of environment steps taken
rewards = []
t0 = 0  # last timestamp for which the rolling coefficient is up to date
while True:
    test = o.features[allcol1].copy()
    pred = np.array(test[cols_to_use[:-1]])
    maxts = int(max(test['timestamp']))
    maxid = int(max(test['id']))
    # Make room for unseen timestamps / ids before indexing with them.
    predtab = expandmas(predtab, maxts, maxid)
    coef = expandmas2(coef, maxts)

    resout = np.zeros((pred.shape[0]))
    for t in range(int(min(test['timestamp'])), maxts + 1):
        sel = np.array(test['timestamp'] == t)
        ids = np.array(test.loc[sel, 'id'], dtype=int)

        # Store this timestamp's features.
        predtab[t, ids, 0:pred.shape[1]] = pred[sel, :]
        if t < 1:
            continue
        # The target of t-1 is estimated from the features at t-1 and t
        # (weights dconst, gconst) wherever it is not already known.
        old = predtab[t - 1, ids, -1]
        new = np.dot(predtab[t - 1:t + 1, ids, 0:3], dconst)
        new = np.dot(new.T, gconst)
        unknown = np.isnan(old)
        old[unknown] = new[unknown]
        predtab[t - 1, ids, -1] = old
        t0 = int(min(t0, t - 1))

        # Rolling prediction for t; ids without a usable value predict 0.
        res, coef = trainrolling(range(t0 + 1, t + 1))
        res = res[ids]
        res[np.isnan(res)] = 0.
        resout[sel] = res
        t0 = t

    # Features for the tree model, prepared as on the training set. The copy
    # keeps the added columns off the environment's own frame.
    test = o.features[col].copy()
    null_counts = test.isnull().sum(axis=1)
    for c in col:
        test[c + '_nan_'] = pd.isnull(test[c])
    test = test.fillna(d_mean)
    test['znull'] = null_counts

    # Blend: 34% rolling regression, 66% extra-trees, both clipped.
    o.target.y = (resout.clip(low_y_cut, high_y_cut) * 0.34) + (model1.predict(test).clip(low_y_cut, high_y_cut) * 0.66)
    target = o.target
    timestamp = o.features["timestamp"].iloc[0]
    if timestamp % 100 == 0:
        print("Timestamp #{}".format(timestamp))
        print(np.mean(rewards))

    o, reward, done, info = env.step(target)
    # Record the step before checking `done` so the final reward is kept.
    rewards.append(reward)
    n += 1
    if done:
        break
print(info)

# Outliers Script top 20%

import kagglegym
import numpy as np
import pandas as pd

# sklearn libraries
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 120)

environment = kagglegym.make() # create the kagglegym environment
observation = environment.reset() # first observation; .train holds the training DataFrame

# Feature columns: everything except the id, target, time and sample columns.
excl = [environment.ID_COL_NAME, environment.TARGET_COL_NAME, environment.TIME_COL_NAME,
environment.SAMPLE_COL_NAME]
col = [c for c in observation.train.columns if c not in excl]

from scipy import stats

# Outlier removal: keep only rows whose target is within 3.6 standard
# deviations of the mean (absolute z-score below 3.6).
df_old = observation.train
df = df_old[(np.abs(stats.zscore(df_old["y"])) < 3.6)]
#df = observation.train
# Per-column mean of the filtered data, used as the fill value for missing data.
d_mean= df[col].mean(axis=0)

# Range of the filtered target; predictions are clipped to it later.
min_y = df["y"].min()
max_y = df["y"].max()
print (min_y, max_y)

X_train =df[col]
# Number of missing values per row, added as the 'znull' feature below.
n = X_train.isnull().sum(axis=1)

# One boolean "was missing" indicator per feature; its own fill value is 0.
for c in col:
r = pd.isnull(X_train.loc[:, c])
X_train[c + '_nan_'] = r
d_mean[c + '_nan_'] = 0

X_train = X_train.fillna(d_mean)
df = df.fillna(d_mean)
X_train['znull'] = n
n = []

# Columns fed to the linear model. The string literal that follows is an
# inert expression listing a longer candidate set; it has no effect.
cols_to_use = ['technical_30', 'technical_20', 'fundamental_11', 'technical_19']
"""['technical_30', 'technical_20', 'fundamental_11', 'technical_27', 'technical_19', 'technical_35',
'technical_11', 'technical_2', 'technical_34', 'fundamental_53', 'fundamental_51',
'fundamental_58']"""

#observation = environment.reset()

# Model 1: extra-trees on all features plus the missing-value indicators.
rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
fit_one = rfr.fit(X_train, df["y"].values)

# Model 2: linear regression on the few columns in cols_to_use.
lr = LinearRegression()
# See what happens if you change this to X_full
fit_two = lr.fit(np.array(df[cols_to_use]), df["y"].values)

# Drop the reference to the training matrix.
X_train= []
# Mean target per id over the full (unfiltered) training set.
ymean_dict = dict(observation.train.groupby(["id"])["y"].mean())

#observation = environment.reset()

while True:
    # Features for the tree model, prepared as on the training set. The copy
    # keeps the added columns off the environment's own frame.
    X_test = observation.features[col].copy()
    n = X_test.isnull().sum(axis=1)
    for c in col:
        X_test[c + '_nan_'] = pd.isnull(X_test[c])
    X_test = X_test.fillna(d_mean)
    X_test['znull'] = n

    # Features for the linear model.
    temp = observation.features.fillna(d_mean)
    X_test_two = np.array(temp[cols_to_use])

    pred = observation.target

    # Blend: 65% extra-trees, 35% linear regression, both clipped to the
    # training target range. The sum is parenthesised so it forms a single
    # statement; previously the "+ (...)" line was a separate expression
    # whose value was discarded, so the linear model never contributed.
    pred['y'] = ((fit_one.predict(X_test).clip(min_y, max_y) * 0.65)
                 + (fit_two.predict(X_test_two).clip(min_y, max_y) * 0.35))
    # Shrink 5% towards the id's mean training target when the id is known.
    pred['y'] = pred.apply(lambda r: 0.95 * r['y'] + 0.05 * ymean_dict[r['id']] if r['id'] in ymean_dict else r['y'], axis = 1)
    # Round to six decimals.
    pred['y'] = [float(format(x, '.6f')) for x in pred['y']]

    timestamp = temp["timestamp"][0]
    if timestamp % 100 == 0:
        print("Timestamp #{}".format(timestamp))

    observation, reward, done, info = environment.step(pred)
    if done:
        break
# A bare `info` expression displays nothing when run as a script.
print(info)

#0.0115