import pandas as pd
import numpy as np
# Build a shuffled subsample of the highest-'rl' rows of the dataset and
# write it to disk as a tab-separated file with the columns 'utr' and 'rl'.

# Rows with fewer total reads than this are discarded.
MIN_TOTAL_READS = 1000
# Number of highest-'rl' rows kept for the subsample.
N_TOP = 10000

# Get and filter dataset
df = pd.read_csv('GSM3130435_egfp_unmod_1.csv')
df = df[df['total_reads'] >= MIN_TOTAL_READS]

# Rank rows from highest to lowest 'rl'; the index is rebuilt so that
# positional and label-based access agree from here on.
df = df.sort_values('rl', ascending=False).reset_index(drop=True)

# Keep only the two columns of interest for the top-ranked rows.
df_top = df[['utr', 'rl']].iloc[:N_TOP].copy()

# Randomly permute the selected rows so the stored file is not ordered by 'rl'.
# NOTE(review): the global NumPy RNG is used without a seed here, so the row
# order differs between runs unless a seed is set elsewhere -- confirm whether
# reproducibility is required.
shuffle_index = np.arange(len(df_top))
np.random.shuffle(shuffle_index)
df_top = df_top.iloc[shuffle_index].reset_index(drop=True)

print(f"len(df_top) = {len(df_top)}")
# Output that was pasted into the script as a bare line (it was a SyntaxError):
#   len(df_top) = 10000

# Store shuffled subsample of data.
# The file is tab-separated even though its name ends in ".csv"; readers must
# pass sep="\t".
df_top.to_csv("optimus5_seqs_strong_high_readcount.csv", index=False, sep="\t")