!head -10 pr_pr_bn32.m4 import sys import os class read_node_0: #experimenting on keeping the data in a list rather than a dictionary, not used def __init__(self, name, length): self.name = name self.length = length self.hits = [] self.hit_map = None def add_hit(self, aln_info): self.hits.append ( aln_info ) def __getitem__(self, target_name): if self.hit_map == None: self.hit_map = dict(zip( [h[0] for h in self.hits], self.hits ) ) return self.hit_map[target_name] class read_node: def __init__(self, name, length): self.name = name self.length = length self.hit_map = {} def add_hit(self, aln_info): self.hit_map[aln_info[0]] = aln_info def __getitem__(self, target_name): return self.hit_map[target_name] @property def hits(self): #another convenient representation of the same data return self.hit_map.values() def get_overlap_data(m4_filename): containment_tolerance = 50 permitted_error_pct = 2 overlap_data = {} contained_reads = set() with open(m4_filename) as m4_f: for l in m4_f: l = l.strip().split() q_name, t_name =l[0:2] if q_name == t_name: continue aln_score = int(l[2]) aln_idt = float(l[3]) if aln_idt < 100-permitted_error_pct: continue q_strand, q_start, q_end, q_len = ( int(x) for x in l[4:8]) t_strand, t_start, t_end, t_len = ( int(x) for x in l[8:12]) if q_len - (q_end - q_start) < containment_tolerance: contained_reads.add(q_name) if t_len - (t_end - t_start) < containment_tolerance: contained_reads.add(t_name) if q_name not in overlap_data: overlap_data[ q_name ] = read_node(q_name, q_len) assert q_strand == 0 if t_name not in [x[0] for x in overlap_data[ q_name ].hits]: overlap_data[ q_name ].add_hit( (t_name, aln_score, aln_idt, (q_strand, q_start, q_end, q_len), (t_strand, t_start, t_end, t_len) ) ) #symmetrized the alignment record if t_name not in overlap_data: overlap_data[ t_name ] = read_node(t_name, t_len) if q_name not in [x[0] for x in overlap_data[ t_name ].hits]: if t_strand == 1: t_start, t_end = t_len - t_end, t_len - t_start q_start, q_end = q_len - q_end, q_len - q_start overlap_data[ t_name ].add_hit( (q_name, aln_score, aln_idt, (0, t_start, t_end, t_len), (1, q_start, q_end, q_len) ) ) else: overlap_data[ t_name ].add_hit( (q_name, aln_score, aln_idt, (0, t_start, t_end, t_len), (0, q_start, q_end, q_len) ) ) return overlap_data, contained_reads overlap_data, contained_reads = get_overlap_data("pr_pr_bn32.m4") def generate_overlap_gml(overlap_data, contained_reads, gml_filename): containment_tolerance = 50 permitted_error_pct = 2 import networkx as nx G=nx.DiGraph() node_in_graph = set() for q_name in overlap_data: if q_name in contained_reads: continue if q_name not in node_in_graph: G.add_node(q_name) targets = overlap_data[ q_name ].hits targets_3prime = [ h for h in targets if h[4][1] < containment_tolerance and h[0] not in contained_reads] targets_5prime = [ h for h in targets if h[3][1] < containment_tolerance and h[0] not in contained_reads] targets_3prime.sort(key = lambda k:k[1]) targets_5prime.sort(key = lambda k:k[1]) if len(targets_3prime) > 0: t = targets_3prime[0] t_name = t[0] if t_name not in node_in_graph: G.add_node(t_name) G.add_edge(q_name, t_name) if len(targets_5prime) > 0: t = targets_5prime[0] t_name = t[0] if t_name not in node_in_graph: G.add_node(t_name) G.add_edge(q_name, t_name) nx.write_gml(G, gml_filename) generate_overlap_gml(overlap_data, contained_reads, "overlap_graph.gml") from IPython.display import Image Image(filename = "overlap_graph01.png") Image(filename = "overlap_graph02.png") Image(filename = "overlap_graph03.png") arrow_defs = """ """ def svg_arrow( x1, y1, x2, y2, col, w): return """\ """ % (col, w, x1, y1, x2, y2, col) def svg_line( x1, y1, x2, y2, col, w): return """\ """ % (col, w, x1, y1, x2, y2) class Overlap_SVG_view: def __init__(self): self.header = """ """ self.footer = """""" self.elements = [] self._svg_data = None def add_element(self, elm): self.elements.append(elm) def _construct_SVG(self): SVG_str = [] SVG_str.append( self.header ) for elm in self.elements: SVG_str.append( elm ) SVG_str.append( self.footer ) self._svg_data = "\n".join(SVG_str) self._svg_data = self._svg_data.decode('utf-8') def output(self, filename): if self._svg_data is None: self._construct_SVG() with open(filename, "w") as out_f: print >> out_f, self._svg_data def _repr_svg_(self): if self._svg_data is None: self._construct_SVG() return self._svg_data @property def svg(self): return SVG(self._repr_svg_()) from IPython.display import SVG import os def generate_overlap_view(overlap_data, q_name): containment_tolerance = 10 hits = overlap_data[q_name].hits SV = Overlap_SVG_view() SV.add_element(arrow_defs) x_offset = 10000 x_scaling = 0.02 x1 = 0 x2 = overlap_data[q_name].length x1 += x_offset x2 += x_offset x1 *= x_scaling x2 *= x_scaling y = 200 SV.add_element(svg_arrow(x1, y, x2, y, "black", 4)) hits.sort(key = lambda k: (k[3][1],k[3][3],k[3][2])) for h in hits: t_name, t_score, t_idt = h[0], h[1], h[2] if t_name in contained_reads: continue q_strand, q_start, q_end, q_len = h[3] t_strand, t_start, t_end, t_len = h[4] y -=10 x1 = q_start x2 = q_end col = None if q_start < containment_tolerance and t_len - t_end < containment_tolerance: x1 = -t_start elif q_len - q_end < containment_tolerance and t_start < containment_tolerance: x2 = q_end + (t_len - t_end) else: col = "red" if t_strand == 1: x1, x2 = x2, x1 print h if t_strand == 0 and col == None: col = "green" elif col == None: col = "blue" x1, x2 = x2, x1 #print h #print min(x1, x2), max(x1, x2), abs(x1-x2) x1 += x_offset x2 += x_offset x1 *= x_scaling x2 *= x_scaling SV.add_element(svg_arrow(x1, y, x2, y, col, 3)) return SV q_name = overlap_data.keys()[0] OSV = generate_overlap_view(overlap_data, q_name) display(OSV.svg) def filter_overlap_hit(overlap_hit): containment_tolerance = 50 q_strand, q_start, q_end, q_len = overlap_hit[3] t_strand, t_start, t_end, t_len = overlap_hit[4] if q_len - (q_end - q_start) < containment_tolerance and t_len - (t_end - t_start) / t_len < containment_tolerance: return False elif overlap_hit[0] in contained_reads: return False else: return True def get_best_overlap_graph(overlap_data): best_overlap_graph = {} containment_tolerance = 50 for q_name in overlap_data: if q_name in contained_reads: continue if q_name not in best_overlap_graph: best_overlap_graph[q_name] = {"5p":None, "3p":None} targets = overlap_data[ q_name ].hits targets_5prime = [ h for h in targets if h[3][1] < containment_tolerance and filter_overlap_hit(h)] targets_3prime = [ h for h in targets if h[4][1] < containment_tolerance and filter_overlap_hit(h)] targets_5prime.sort(key = lambda k:k[1]) targets_3prime.sort(key = lambda k:k[1]) if len(targets_5prime) > 0: best_overlap_graph[q_name]["5p"] = targets_5prime[0] t_name = targets_5prime[0][0] if len(targets_3prime) > 0: best_overlap_graph[q_name]["3p"] = targets_3prime[0] t_name = targets_3prime[0][0] return best_overlap_graph best_overlap_graph = get_best_overlap_graph(overlap_data) q_name = best_overlap_graph.keys()[0] print q_name, "has best partners:", best_overlap_graph[q_name] def proper_overlap_hit(overlap_hit): containment_tolerance = 50 q_strand, q_start, q_end, q_len = overlap_hit[3] t_strand, t_start, t_end, t_len = overlap_hit[4] if q_start < containment_tolerance: if t_len - t_end > containment_tolerance: return False else: return True if t_start < containment_tolerance: if q_len - q_end > containment_tolerance: return False else: return True def find_path( q_name_end, frag_used = set() ): reverse_end = {"3p":"5p", "5p":"3p"} path = [] path_q_name = set() q_name, end = q_name_end if end == "5p": reversing_path = True else: reversing_path = False while 1: if q_name in frag_used: if reversing_path: path.reverse() return path, "frag_used" path.append( (q_name, end) ) path_q_name.add(q_name) if q_name not in best_overlap_graph: if reversing_path: path.reverse() return path, "terminate_1" next_hit = best_overlap_graph[q_name][end] #print next_hit if next_hit == None: if reversing_path: path.reverse() return path, "terminate_2" if next_hit[0] in best_overlap_graph: #if not mutual good hits, break the path # Using mutual best hit might be to strigent, # bh = [] #if best_overlap_graph[next_hit[0]]["5p"]: # bh.append( best_overlap_graph[next_hit[0]]["5p"][0] ) #if best_overlap_graph[next_hit[0]]["3p"]: # bh.append( best_overlap_graph[next_hit[0]]["3p"][0] ) bh = [h[0] for h in overlap_data[next_hit[0]].hits if proper_overlap_hit(h)] if q_name not in bh: if reversing_path: path.reverse() return path, "branch" q_name = next_hit[0] if q_name in path_q_name: if reversing_path: path.reverse() return path, "circle" if next_hit[4][0] == 1: #reversed strand end = reverse_end[end] len_qname = [ (overlap_data[x].length, x) for x in best_overlap_graph.keys() ] len_qname.sort() q_name = len_qname[-1][1] #q_name = "0xa4919d66_0061381" print "The longest fragment is", q_name path_5p, s_5p = find_path( (q_name, "5p") ) path_3p, s_3p = find_path( (q_name, "3p") ) print "The number of the fragment of the 5' path is", len(path_5p) print "The number of the fragment of the 3' path is", len(path_3p) print "The total number of framgent of both 3' and 5' path is %d." % (len(path_3p)+len(path_5p)-1) print "%d is greater than the total number of fragments %d in the best_overlap_graph." % (len(path_3p)+len(path_5p)-1, len(best_overlap_graph)) print "The begin of the 5'-path is", path_5p[0] print "The end of the 3'-path is", path_3p[-1] seq_db = {} with open("pre_assembled_reads.fa") as seq_file: for l in seq_file: l = l.strip() if l[0] == ">": name = l[1:] continue else: seq_db[name] = l def rc_seq(seq): rev_map = dict(zip("acgtACGTNn-","tgcaTGCANn-")) return "".join([rev_map[c] for c in seq[::-1]]) def layout_path(full_path, frag_used, out_fn, tig_name): if len(full_path) == 0 or full_path[0][0] in frag_used: return None if len(full_path) == 1: with open("singleton_"+out_fn, "w") as out_seq_file: seq = seq_db[full_path[0][0]] print >>out_seq_file, ">%s" % ( "singleton_" + tig_name ) print >>out_seq_file, seq frag_used.add(full_path[0][0]) return None first = full_path[0] offset = 0 current_orientation = first[1] revserse_orientation = {"5p":"3p", "3p":"5p"} tig_seq = [] frag_in_layout = set() frag_in_layout.add(first[0]) for second in full_path[1:]: overlap_hit = overlap_data[first[0]][second[0]] q_strand, q_start, q_end, q_len = overlap_hit[3] t_strand, t_start, t_end, t_len = overlap_hit[4] #print overlap_hit, offset, current_orientation seq = seq_db[first[0]] if current_orientation == "3p": seq = rc_seq(seq) del tig_seq[offset:-1] tig_seq.extend(seq) if current_orientation == "5p": offset += q_start elif current_orientation == "3p": offset += q_len - q_end if t_strand == 1: current_orientation = revserse_orientation[current_orientation] frag_in_layout.add(first[0]) first = second if second[0] in frag_in_layout: break if second[0] in frag_used: break seq = seq_db[first[0]] if current_orientation == "3p": seq = rc_seq(seq) del tig_seq[offset:-1] tig_seq.extend(seq) frag_in_layout.add(first[0]) frag_used.update( frag_in_layout ) tig_seq = tig_seq[0:offset+len(seq)] tig_seq = "".join(tig_seq) with open(out_fn, "w") as out_seq_file: print >>out_seq_file, ">%s" % tig_name print >>out_seq_file, tig_seq frag_used = set() all_best_overlap_frags = set(best_overlap_graph.keys()) unused_frag = all_best_overlap_frags - frag_used i = 0 while len(unused_frag) != 0: len_qname = [ (overlap_data[x].length, x) for x in unused_frag ] len_qname.sort() q_name = len_qname[-1][1] print "iteration: ",i print "frag is used?", q_name in frag_used if q_name in frag_used: continue path_5p, s_5p = find_path( (q_name, "5p"), frag_used ) path_3p, s_3p = find_path( (q_name, "3p"), frag_used ) print "seeding frag:", q_name, "Length:", overlap_data[q_name].length print "number of unused frag:", len(unused_frag), "total overlapped frag:", len(path_3p)+len(path_5p) print len(path_5p), s_5p, len(path_3p), s_3p print "--------------------------" #print path_5p[0], path_3p[-1] if len(path_5p) + len(path_3p) == 0: out_fn = "tig_%05d.fa" % i tig_name ="tig%05d" % i with open("singleton_"+out_fn, "w") as out_seq_file: seq = seq_db[q_name] print >>out_seq_file, ">%s" % ( "singleton_" + tig_name ) print >>out_seq_file, seq frag_used.add(q_name) unused_frag = all_best_overlap_frags - frag_used i += 1 continue if len(path_5p) > 0: #assert path_5p[-1] == path_3p[0] full_path = path_5p + path_3p[1:] else: full_path = path_3p layout_path(full_path, frag_used, "tig_%05d.fa" % i, "tig%05d" % i) unused_frag = all_best_overlap_frags - frag_used i += 1 !cat singleton_tig_000* > singletons.fa !blasr singletons.fa ecoli_k12_MG1655.fasta -m 4 -noSplitSubreads -nCandidates 20 -bestn 1 | awk '{print $1" "$2" "$4" "$6" "$7" "$8" "$8-($7-$6)}' Image(filename="tig_align.jpeg") #!dnadiff ecoli_k12_MG1655.fasta tig_00000.fa -p diff_tig0 !echo the number of SNPs is `cat diff_tig0.snps | wc | awk '{print $1}'` #dnadiff ecoli_k12_MG1655.fasta output.fasta -p diff_quiver_tig0 !echo the number of SNPs is `cat diff_quiver_tig0.snps | wc | awk '{print $1}'` from math import log print "The phred scale QV of the assembly is about %0.1f" % (-10*log(26.0/4639675)/log(10)) print "The concordnace is about %0.5f%%" % (100*(1-26.0/4639675)) !time python Write_An_Assembler.py > /dev/null