import pandas as pd
# Load the tab-separated profile table. The file's first line is skipped and
# no header row is read from the file; the column names are supplied here.
res = pd.read_csv(
    "species_cranky_wozniak",
    sep="\t",
    header=None,
    skiprows=1,
    names=["taxid", "rank", "taxpath", "taxpathsn", "percentage"],
)
res.head()
taxid | rank | taxpath | taxpathsn | percentage | |
---|---|---|---|---|---|
0 | 56636 | species | 2157|28889|183924|114380|2272|56635|56636 | Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... | 0.0 |
1 | 477693 | species | 2157|28889|183924|114380|2272|2273|477693 | Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... | 0.0 |
2 | 160233 | species | 2157|28889|183924|114380|2272|54258|160233 | Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... | 0.0 |
3 | 54248 | species | 2157|28889|183924|114380|2307|54247|54248 | Archaea|Crenarchaeota|Thermoprotei|Desulfuroco... | 0.0 |
4 | 2285 | species | 2157|28889|183924|2281|118883|2284|2285 | Archaea|Crenarchaeota|Thermoprotei|Sulfolobale... | 0.0 |
# Dimensions of the loaded table as (number of rows, number of columns).
res.shape
(1029, 5)
def summarize_all_levels(df: pd.DataFrame, ranks) -> pd.DataFrame:
    """Roll each row's percentage up to every rank except the last in *ranks*.

    For every row of *df* the ``|``-separated ``taxpath`` / ``taxpathsn``
    lineage is walked from its first entry onwards.  ``ranks[:-1]`` is paired
    positionally with the lineage entries, and for each pair one record is
    emitted that carries the lineage prefix up to that entry and the row's
    percentage.  Records that share taxid, rank and both lineage prefixes are
    then summed.

    Args:
        df: Table with exactly the five columns ``taxid``, ``rank``,
            ``taxpath``, ``taxpathsn`` and ``percentage``, in that order.
            The ``taxid`` and ``rank`` values of the input rows are ignored;
            both are re-derived from the lineage.
        ranks: Sliceable sequence of rank names, ordered like the entries of
            ``taxpath``.  The last rank is never emitted.

    Returns:
        A new DataFrame with the same column names as *df*, one row per
        distinct (taxid, rank, taxpath, taxpathsn) combination, holding the
        summed percentage.  Note that ``taxid`` values in the result are
        strings, because they are taken from the split ``taxpath``.
    """
    # Loop-invariant: every rank except the last one is summarised.
    summarised_ranks = ranks[:-1]

    expanded_rows = []
    for _, _, path_ids, path_names, percentage in df.itertuples(index=False, name=None):
        lineage_ids = path_ids.split("|")
        lineage_names = path_names.split("|")
        # zip() stops at the shorter input, so a lineage shorter than the
        # rank list only yields records for the ranks it actually covers.
        for depth, (rank, tax_id) in enumerate(zip(summarised_ranks, lineage_ids), 1):
            expanded_rows.append([
                tax_id,
                rank,
                "|".join(lineage_ids[:depth]),
                "|".join(lineage_names[:depth]),
                percentage,
            ])

    expanded = pd.DataFrame(expanded_rows, columns=df.columns)
    return expanded.groupby(
        ['taxid', 'rank', 'taxpath', 'taxpathsn'], as_index=False
    ).sum()
# Rank names in lineage order; both names refer to the same list object.
ranks = "superkingdom|phylum|class|order|family|genus|species".split("|")
tax_ranks = ranks
# Aggregate the loaded table at every rank except the last one in the list.
new_df = summarize_all_levels(res, tax_ranks)
new_df
taxid | rank | taxpath | taxpathsn | percentage | |
---|---|---|---|---|---|
0 | 10 | genus | 2|1224|1236|72274|135621|10 | Bacteria|Proteobacteria|Gammaproteobacteria|Ps... | 0.010060 |
1 | 100715 | genus | 2|1090|191410|191411|191412|100715 | Bacteria|Chlorobi|Chlorobia|Chlorobiales|Chlor... | 0.000000 |
2 | 100883 | genus | 2|1239|526524|526525|128827|100883 | Bacteria|Firmicutes|Erysipelotrichia|Erysipelo... | 0.000000 |
3 | 1016 | genus | 2|976|117743|200644|49546|1016 | Bacteria|Bacteroidetes|Flavobacteriia|Flavobac... | 0.000000 |
4 | 1021 | genus | 2|1224|1236|72273|135617|1021 | Bacteria|Proteobacteria|Gammaproteobacteria|Th... | 0.000000 |
... | ... | ... | ... | ... | ... |
1033 | 97050 | genus | 2|1224|28211|204455|31989|97050 | Bacteria|Proteobacteria|Alphaproteobacteria|Rh... | 0.191146 |
1034 | 972 | family | 2|1239|186801|53433|972 | Bacteria|Firmicutes|Clostridia|Halanaerobiales... | 0.010060 |
1035 | 976 | phylum | 2|976 | Bacteria|Bacteroidetes | 2.263578 |
1036 | 978 | genus | 2|976|768503|768507|89373|978 | Bacteria|Bacteroidetes|Cytophagia|Cytophagales... | 0.010060 |
1037 | 995019 | family | 2|1224|28216|80840|995019 | Bacteria|Proteobacteria|Betaproteobacteria|Bur... | 0.010060 |
1038 rows × 5 columns
# Total of the percentage column over all loaded rows (built-in sum over the column).
sum(res['percentage'])
94.30576299999994