#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_cell_magic('capture', '', '\nimport numpy as np\nimport pandas as pd\nimport xarray as xr\n\nimport plotly.offline as py\nimport plotly.graph_objs as go\npy.init_notebook_mode()\nfrom IPython.display import Image\n\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import confusion_matrix\nfrom keras.models import Sequential\nfrom keras.layers import Dense\n\nfrom tqdm import tqdm\n\nfrom trackml.dataset import load_event\n')


# I won't explain the content of this competition here, everything can be found on the kaggle site: https://www.kaggle.com/c/trackml-particle-identification. However, I will only analyze one single event in this notebook (because I don't have a lot of computing power at hand). This event is visualized in a few graphs below.

# In[2]:


# Load data
hits, cells, particles, truth = load_event("./train_100_events/event000001000")
truth.index = truth.hit_id
hits.index = hits.hit_id

# Add "distance from origin"-column and sort by it
truth['dist'] = (truth.tx**2 + truth.ty**2 + truth.tz**2) ** (1/2)
truth = truth.sort_values('dist')

# Plot all hits
trace1 = go.Scatter3d(x=truth.tx, y=truth.ty, z=truth.tz, mode='markers', opacity=0.3, name="Hits")
trace2 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2], layout=layout)
# py.iplot(fig)

Image('fig1.png')


# In[3]:


# Two test tracks
track1 = truth[truth.particle_id == truth.loc[20906].particle_id]
track2 = truth[truth.particle_id == truth.loc[22405].particle_id]

track1


# In[4]:


track2


# In[5]:


# Visualize test tracks

trace1 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', name="Track 1")
trace2 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', name="Track 2")
trace3 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
#py.iplot(fig)

Image('fig2.png')


# Now the task is to find these two (and all of the other) tracks from the unannotated mess of points above.
# A logical first step is to look for the first hits of the track. One might expect these to be the ones closest to the origin point, but because of the cylindrical structure of the detector, distance isn't the way to go, as can be seen below, where all _actual_ first hits are plottet (for orientation, both example tracks are also plottet):

# In[6]:


# True first hits

first_true = truth.drop_duplicates(subset='particle_id')

trace1 = go.Scatter3d(x=first_true.tx, y=first_true.ty, z=first_true.tz, mode='markers', opacity=0.3, name="True first hits")
trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', opacity=1, name="Track 2")
trace4 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
#py.iplot(fig)

Image('fig3.png')


# It is evident that most first hits lie on the innermost cylindrical detector, as is to be expected. However, a lot of random mess can be seen everywhere, where the particle apparently was not detected by detectors closer to the origin. To predict these first hits, I used a random forest classifier.

# In[7]:


# Predict first hits

X = truth
y = truth.hit_id.isin(first_true.hit_id)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4)

model_first = RandomForestClassifier()
model_first.fit(X_train, y_train)

print("Confusion Matrix of test data:\n", confusion_matrix(y_test, model_first.predict(X_test)))


# These results are not great, but they would probably get better if more data was used to train the classifier. Below is a plot of what the model predicts as the first hits, but remember that it was trained on 3/4 of exactly these points, so if it gets a lot correct that is not coincidence or brilliance, but just the fact that the model was fit to this data.

# In[8]:


# Predicted first hits

y_pred = model_first.predict(truth)
first_pred = truth[y_pred]

trace1 = go.Scatter3d(x=first_pred.tx, y=first_pred.ty, z=first_pred.tz, mode='markers', opacity=0.3, name="Predicted first hits")
trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', opacity=1, name="Track 2")
trace4 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
#py.iplot(fig)

Image('fig4.png')


# In[9]:


# Similarity

print("Total number of predicted first hits:", y_pred.sum())
print("Total errors made:", (y_pred ^ y).sum())
print("{:f}% of the prediction is correct".format((len(y) - (y_pred ^ y).sum()) / len(y)))


# With that, we can now turn to the second hits. This is where it gets more tricky, and the huge amount of data really becomes a problem. If one were to look for a second hit for every of our predicted first hits among all of the data, there would be far too many possibilities:

# In[10]:


num = len(y_pred) * (len(truth) - 1)    # We can exclude the first hit itself as second hit 🎉
print("Number of possibilities:", num)


# Now if each of the calculations took only ten microsecond, it would still take:

# In[11]:


print(pd.Timedelta('10 us') * num)


# to predict all possible second hits for one event! Now suppose we would take the ten most likely second hits per first hit and predict the third hits for those, that would then take

# In[12]:


# This time, we can exclude two hits on our search for the third one!
num = len(y_pred) * 10 * (len(truth) - 2)
print(pd.Timedelta('10 us') * num)


# Alright, that is definetely too long. But, because we are not looking for a random point among all other ones, we can just exclude most points when looking for a second hit. For example, we know the particle physically has to continue moving away from the origin. That already excludes all hits that are closer to the origin than our first hit. Also, the second hit won't be too far away from the first hit, we can calculate the average distance of the two:

# In[13]:


l = np.array([])
for _, hits in truth.groupby("particle_id"):
    if len(hits) > 1:
        l = np.append(l, [np.sqrt((hits.tx.iloc[1] - hits.tx.iloc[0])**2 
                                   + (hits.ty.iloc[1] - hits.ty.iloc[0])**2 
                                   + (hits.tz.iloc[1] - hits.tz.iloc[0])**2)])

print(f"The average distance between first and second hit is {l.mean():.2f} "
      f"with a standard deviation of {l.std():.2f}.")


# Now we can define a cone with the first hit being the starting point in which the second point must be. We can use efficient linear algebra libraries to rapidly calculate these cones:

# In[14]:


get_ipython().run_cell_magic('time', '', "\n# First layer, I only use 5000 points because otherwise my laptop crashes.\nlayer = first_true.iloc[1:5001]\n\n# Put all hits into a tensor with coordinates hit_id, and prev; prev here is the hit_id of the previous hit (namely the first hit)\nx = xr.DataArray(np.resize(truth.tx.values, (len(layer), len(truth))).T, coords={'hit_id': truth.hit_id.values, 'prev': layer.hit_id.values}, dims=['hit_id', 'prev'])\ny = xr.DataArray(np.resize(truth.ty.values, (len(layer), len(truth))).T, coords={'hit_id': truth.hit_id.values, 'prev': layer.hit_id.values}, dims=['hit_id', 'prev'])\nz = xr.DataArray(np.resize(truth.tz.values, (len(layer), len(truth))).T, coords={'hit_id': truth.hit_id.values, 'prev': layer.hit_id.values}, dims=['hit_id', 'prev'])\nds = x.to_dataset(name='x')\nds['y'] = y\nds['z'] = z\n\n# Put first hits in similar shape as all other hits (xarray)\nX = layer[['tx', 'ty', 'tz']].to_xarray()\nX = X.rename({'hit_id': 'prev', 'tx': 'x', 'ty': 'y', 'tz': 'z'})\n\n# Calculate the angle between all first hits and possible second hits\nxp = ds - X    # Elements of xp are the vectors between the first hit (x) and possible second hit (p)\nxpl = (xp.x**2 + xp.y**2 + xp.z**2)**(1/2)\np = xp/xpl     # Norm all vectors\n\nox = X         # Elements of ox are the vectors between the origin (o) and the first hit (x)\noxl = (ox.x**2 + ox.y**2 + ox.z**2)**(1/2)\nn = ox/oxl     # Norm all vectors\n\ns = n * p      # Calculate cosine of angles between points\n\n# Exclude all hits where the angle is > 36.9° or the distance is < 300\ncones = ((((s.x + s.y + s.z) > 0.8)) & (xpl < 300)).T\n")


# To illustrate this result I will plot the cone of the first test track below:

# In[15]:


# Visualize cones
cone1 = truth[cones.sel(prev=20906).values]
trace1 = go.Scatter3d(x=cone1.tx, y=cone1.ty, z=cone1.tz, mode='markers', opacity=0.3, name="Cone")
trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track")
trace3 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
#py.iplot(fig)

Image('fig5.png')


# Now these cones we can work with. A simple neural net can classify between "looks like a second hit" and "does not look like a second hit" among the hits inside a cone. To train the net I will use the second hit as a positive example and 10 random hits from the cone as negative example, so that the distribution isn't too skewed towards negative examples.

# In[16]:


# Process data for NN

remains = ds.where(cones)    # Hits inside cones

X = np.zeros(6)
y = np.array([])

for fhit in tqdm(ds.prev):
    # Get 10 random hits from cone
    remain = remains.sel(prev=fhit).dropna('hit_id')
    remain = remain.sel(hit_id=np.random.choice(remain.hit_id.values, 10)).to_dataframe()
    
    # Add the one true second hit
    try:
        t = truth[truth.particle_id == truth.particle_id.loc[int(fhit)]].iloc[1]
    except:
        continue
    
    remain = remain[remain.index != t.hit_id]
    prev = truth.loc[np.full((len(remain),), fhit)]
    y = np.concatenate((y, len(remain)*[0], [1]))
    X = np.vstack([X, np.vstack([prev.tx, prev.ty, prev.tz, remain.x.values, remain.y.values, remain.z.values]).T, 
                   [prev.tx.iloc[0], prev.ty.iloc[0], prev.tz.iloc[0], t.tx, t.ty, t.tz]])

X = X[1:]


# In[17]:


# Train the NN

model_second = Sequential()
model_second.add(Dense(64, activation='relu', input_shape=(6,)))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(64, activation='relu'))
model_second.add(Dense(1, activation='sigmoid'))
model_second.compile(optimizer='rmsprop', loss='binary_crossentropy')
model_second.fit(X, y, epochs=5)


# In[18]:


# Test the NN on the two test tracks

cone1 = truth[cones.sel(prev=20906).values][['tx', 'ty', 'tz']].values
X1 = np.hstack([np.repeat(truth.loc[20906][['tx', 'ty', 'tz']].values[np.newaxis], len(cone1), axis=0), cone1])
second_1 = cone1[np.argmax(model_second.predict(X1))]

cone2 = truth[cones.sel(prev=22405).values][['tx', 'ty', 'tz']].values
X2 = np.hstack([np.repeat(truth.loc[22405][['tx', 'ty', 'tz']].values[np.newaxis], len(cone2), axis=0), cone2])
second_2 = cone2[np.argmax(model_second.predict(X2))]

trace1 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")

trace2 = go.Scatter3d(x=track1.tx.iloc[2:], y=track1.ty.iloc[2:], z=track1.tz.iloc[2:], mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=[track1.tx.iloc[0]], y=[track1.ty.iloc[0]], z=[track1.tz.iloc[0]], mode='markers', opacity=1, name="First hit 1")
trace4 = go.Scatter3d(x=[track1.tx.iloc[1]], y=[track1.ty.iloc[1]], z=[track1.tz.iloc[1]], mode='markers', opacity=1, name="Second hit 1 (true)")
trace5 = go.Scatter3d(x=[second_1[0]], y=[second_1[1]], z=[second_1[2]], mode='markers', opacity=1, name="Second hit 1 (predicted)")

trace6 = go.Scatter3d(x=track2.tx.iloc[2:], y=track2.ty.iloc[2:], z=track2.tz.iloc[2:], mode='markers', opacity=1, name="Track 2")
trace7 = go.Scatter3d(x=[track2.tx.iloc[0]], y=[track2.ty.iloc[0]], z=[track2.tz.iloc[0]], mode='markers', opacity=1, name="First 2 hit")
trace8 = go.Scatter3d(x=[track2.tx.iloc[1]], y=[track2.ty.iloc[1]], z=[track2.tz.iloc[1]], mode='markers', opacity=1, name="Second 2 hit (true)")
trace9 = go.Scatter3d(x=[second_2[0]], y=[second_2[1]], z=[second_2[2]], mode='markers', opacity=1, name="Second hit 2 (predicted)")

layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4, trace5, trace6, trace7, trace8, trace9], layout=layout)
#py.iplot(fig)

Image('fig6.png')


# The model accurately predicts the second hit of the first track, but fails to predict the second hit of the second track, though on a previous trial the same network predicted the second hit of the first track accurately.
# 
# To predict the third and subsequent hits we will use neural nets that use the previous hits as input and output the predicted position in space of the next hit. Then we can use the closest available hit as a prediction.

# In[19]:


# Process data for NN

X = np.zeros(6)
y = np.zeros(3)

for fhit in tqdm(ds.prev):
    t = truth[truth.particle_id == truth.particle_id.loc[int(fhit)]]
    try:
        y = np.vstack([y, np.r_[t.iloc[2].tx, t.iloc[2].ty, t.iloc[2].tz]])
        X = np.vstack([X, np.r_[t.iloc[0].tx, t.iloc[0].ty, t.iloc[0].tz, 
                                t.iloc[1].tx, t.iloc[1].ty, t.iloc[1].tz]])
    except:
        continue
    
X = X[1:]
y = y[1:]


# In[20]:


# Train the NN

model_third = Sequential()
model_third.add(Dense(64, activation='relu', input_shape=(6,)))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(64, activation='relu'))
model_third.add(Dense(3))
model_third.compile(optimizer='rmsprop', loss='mse')
model_third.fit(X, y, epochs=20)


# In[21]:


# Test the NN on the two test tracks

X1 = np.r_[track1.tx.iloc[0], track1.ty.iloc[0], track1.tz.iloc[0], track1.tx.iloc[1], track1.ty.iloc[1], track1.tz.iloc[1]]
third_1 = model_third.predict(X1[np.newaxis])[0]    # Predicted position
third_1 = truth.iloc[np.argmin(np.sqrt(np.sum((truth[['tx', 'ty', 'tz']].values - third_1)**2, axis=1)))]    # Nearest hit

X2 = np.r_[track2.tx.iloc[0], track2.ty.iloc[0], track2.tz.iloc[0], track2.tx.iloc[1], track2.ty.iloc[1], track2.tz.iloc[1]]
third_2 = model_third.predict(X2[np.newaxis])[0]
third_2 = truth.iloc[np.argmin(np.sqrt(np.sum((truth[['tx', 'ty', 'tz']].values - third_2)**2, axis=1)))]    # Nearest hit

trace1 = go.Scatter3d(x=[0], y=[0], z=[0], mode='markers', opacity=1, name="Origin")

trace2 = go.Scatter3d(x=track1.tx, y=track1.ty, z=track1.tz, mode='markers', opacity=1, name="Track 1")
trace3 = go.Scatter3d(x=[track1.tx.iloc[2]], y=[track1.ty.iloc[2]], z=[track1.tz.iloc[2]], mode='markers', opacity=1, name="Third hit 1 (true)")
trace4 = go.Scatter3d(x=[third_1.tx], y=[third_1.ty], z=[third_1.tz], mode='markers', opacity=1, name="Third hit 1 (predicted)")

trace5 = go.Scatter3d(x=track2.tx, y=track2.ty, z=track2.tz, mode='markers', opacity=1, name="Track 2")
trace6 = go.Scatter3d(x=[track2.tx.iloc[2]], y=[track2.ty.iloc[2]], z=[track2.tz.iloc[2]], mode='markers', opacity=1, name="Third hit 2 (true)")
trace7 = go.Scatter3d(x=[third_2.tx], y=[third_2.ty], z=[third_2.tz], mode='markers', opacity=1, name="Third hit 2 (predicted)")

layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[trace1, trace2, trace3, trace4, trace5, trace6, trace7], layout=layout)
#py.iplot(fig)

Image('fig7.png')


# The model fails to predict the third hit for the second track, but correctly predicts the third hit for the first track. Again, the model probably would get better if it were trained on a lot more data, but this algorithm is also very rudimentary and can be improved a great deal if more time was invested.
# 
# However, I have already missed the deadline on this kaggle competition and won't continue work from here on.
# Thank you very much for reading :)