# download uniprot ID mapping
#! wget --directory-prefix download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
! shasum download/idmapping.dat.gz
This file has three columns, delimited by tab: UniProtKB-AC, ID_type, and ID,
where ID_type is the database name as it appears in UniProtKB cross-references (and as supported by the ID mapping tool on the UniProt web site, http://www.uniprot.org/mapping), and where ID is the identifier in that cross-referenced database.
import os
import csv
import io
import gzip
def generate_idmapping(path):
    """Yield the rows of a gzip-compressed, tab-delimited file.

    Args:
        path: Location of the gzip file to read (e.g. ``idmapping.dat.gz``).

    Yields:
        One list of strings per line of the file, split on tab characters
        by ``csv.reader``.
    """
    # Opening inside ``with`` guarantees the file is closed even when the
    # consumer stops iterating early or an exception propagates; previously
    # the handle was only closed if the generator ran to exhaustion.
    with gzip.open(path, 'rt') as text:
        reader = csv.reader(text, delimiter='\t')
        for row in reader:
            yield row
# Stream the UniProt ID mapping file and keep only the cross-reference
# types listed in `extract`.
path = os.path.join('download', 'idmapping.dat.gz')
mapping_generator = generate_idmapping(path)

# Cross-referenced databases to extract; each gets its own output file.
extract = {'GeneID', 'HGNC'}
mappings = {target: set() for target in extract}
for accession, target, target_id in mapping_generator:
    if target not in extract:
        continue
    # A set de-duplicates repeated (accession, target_id) pairs.
    mappings[target].add((accession, target_id))

# Write one sorted, gzip-compressed TSV per extracted target.
output_dir = os.path.join('data', 'map')
os.makedirs(output_dir, exist_ok=True)
for target, mapset in mappings.items():
    path = os.path.join(output_dir, '{}.tsv.gz'.format(target))
    # Text-mode gzip inside ``with`` flushes and closes the text layer before
    # the compressed stream. Closing only the underlying binary file (as was
    # done before) can drop text still buffered in the wrapper.
    # newline='' follows the csv module's guidance so the writer's own line
    # terminators are not translated again.
    with gzip.open(path, 'wt', newline='') as write_file:
        writer = csv.writer(write_file, delimiter='\t')
        writer.writerow(['uniprot', target])
        writer.writerows(sorted(mapset))