"""Convert host-linkage datasets into Gource custom-log files.

Two datasets are handled:

* the BL host-linkage file, converted line by line with ``to_gource``;
* the York host-linkage file, loaded into the module-level ``known``,
  ``years`` and ``paths`` containers and then written out as a sequence of
  add / modify / delete events, one pass per year.

Input lines are split on both ``|`` and TAB; the first field is parsed as a
four-digit year and the next two are treated as host names.
"""

import time

BL_LINKAGE_PATH = './host-linkage/bl-uk-host-linkage.dat'
BL_GOURCE_LOG_PATH = './host-linkage/linkage.log'
YORK_LINKAGE_PATH = './host-linkage/york-ac-uk-linkage.tsv'
YORK_GOURCE_LOG_PATH = './host-linkage/york-linkage.log'

# Progress is reported every this many input lines.
PROGRESS_INTERVAL = 10000

# Colour written for each link state returned by get_state().
STATE_COLOURS = {
    "in": "FFFF00",
    "out": "0000FF",
    "both": "00FF00",
}

# State filled in by load_york_linkage() and read by get_state() /
# write_york_gource_log().
known = {}     # "year|path|in" or "year|path|out" -> link_num
years = set()  # every year string seen in the York dataset
paths = set()  # every reversed-host path seen in the York dataset


def _split_fields(line):
    """Split a raw dataset line on '|' and TAB after stripping the line end."""
    return line.rstrip().replace('|', '\t').split("\t")


def _host_to_path(host):
    """Turn 'www.bl.uk' into the reversed, slash-separated 'uk/bl/www'."""
    return '/'.join(reversed(host.split('.')))


def _year_to_timestamp(year):
    """Return the local-time Unix timestamp of 1 January of *year* (a string)."""
    return int(time.mktime(time.strptime(year, "%Y")))


def to_gource(line):
    """Convert one BL linkage line into a Gource log entry.

    The entry has the form ``timestamp|hostname|A|path|colour``. When the
    third field contains "bl.uk" the second field is used as the hostname and
    the colour is FF0000; otherwise the third field is used and the colour is
    0000FF.

    Raises:
        IndexError: if the line has fewer than three fields.
        ValueError: if the first field is not a ``%Y`` year.
    """
    row = _split_fields(line)
    timestamp = _year_to_timestamp(row[0])
    if "bl.uk" in row[2]:
        hostname = row[1]
        colour = "FF0000"
    else:
        hostname = row[2]
        colour = "0000FF"
    action = "A"
    path = _host_to_path(hostname)
    return "{}|{}|{}|{}|{}".format(timestamp, hostname, action, path, colour)


def transform_link(line):
    """Convert one linkage line into ``(year, path, link_source, link_num)``.

    ``path`` is the reversed-host path of the second field when the third
    field contains "bl.uk", and of the third field otherwise.

    Raises:
        IndexError: if the line has fewer than four fields.
    """
    row = _split_fields(line)
    year, link_source, link_target, link_num = row[0], row[1], row[2], row[3]
    # NOTE(review): this tests for "bl.uk" even though the function is used on
    # the York dataset; it may have been meant to test for "york.ac.uk".
    # Behaviour is kept as-is until that is confirmed.
    host = link_source if "bl.uk" in link_target else link_target
    return (year, _host_to_path(host), link_source, link_num)


def get_state(year, path):
    """Return "in", "out", "both" or None for *path* in *year*.

    Looks up the keys ``year|path|in`` and ``year|path|out`` in ``known``.
    *year* may be a string or an int; it is formatted into the key either way.
    """
    has_in = "{}|{}|{}".format(year, path, "in") in known
    has_out = "{}|{}|{}".format(year, path, "out") in known
    if has_in and has_out:
        return "both"
    if has_out:
        return "out"
    if has_in:
        return "in"
    return None


def peek_first_line(src=BL_LINKAGE_PATH):
    """Print the first line of *src* and its split fields, as a sanity check."""
    with open(src, 'r') as f:
        for line in f:
            print(line)
            print(_split_fields(line))
            break


def convert_bl_linkage(src=BL_LINKAGE_PATH, dst=BL_GOURCE_LOG_PATH):
    """Write every line of *src* to *dst* in Gource format; return the count."""
    counter = 0
    with open(src, 'r') as fin, open(dst, 'w') as fout:
        for line in fin:
            new_line = to_gource(line)
            fout.write(new_line)
            fout.write('\n')
            counter += 1
            if counter % PROGRESS_INTERVAL == 0:
                print(counter, line, new_line, '\n')
    print("Wrote {} lines.".format(counter))
    return counter


def load_york_linkage(src=YORK_LINKAGE_PATH):
    """Populate ``known``, ``years`` and ``paths`` from *src*; return the count.

    A link whose source contains "york.ac.uk" is recorded as "out", any other
    link as "in". A line that cannot be processed is printed together with
    the error and the exception is then re-raised to stop the run.
    """
    counter = 0
    with open(src, 'r') as fin:
        for line in fin:
            try:
                year, path, link_source, link_num = transform_link(line)
                link_type = "out" if "york.ac.uk" in link_source else "in"
                key = "{}|{}|{}".format(year, path, link_type)
                known[key] = link_num
                years.add(year)
                paths.add(path)
            except Exception as e:
                print(e)
                print(line)
                # Re-raise the real error; the original halted by evaluating
                # an undefined name, which masked it behind a NameError.
                raise
            counter += 1
            if counter % PROGRESS_INTERVAL == 0:
                print(counter, line, key, '\n')
    print("Processed {} lines.".format(counter))
    return counter


def write_york_gource_log(dst=YORK_GOURCE_LOG_PATH):
    """Write year-on-year state changes to *dst*.

    Each line has the form "1230768000|lv/bank/www|A|lv/bank/www/www.bank.lv|0000FF".
    For every year, each path's state is compared with its state in the
    preceding calendar year: a new state is written as "A", a changed state
    as "M", and a state that disappeared as "D" with colour 000000.

    Returns:
        ``(changes, deletions)`` - the number of A/M lines and D lines written.
    """
    changes = 0
    deletions = 0
    with open(dst, 'w') as fout:
        for year in sorted(years):
            timestamp = _year_to_timestamp(year)
            # Sorted so the output is reproducible; set order is arbitrary.
            for path in sorted(paths):
                current_state = get_state(year, path)
                previous_state = get_state(int(year) - 1, path)
                if current_state == previous_state:
                    continue
                if current_state is None:
                    # The link existed last year and is gone this year.
                    deletions += 1
                    action = "D"
                    colour = "000000"
                else:
                    changes += 1
                    action = "A" if previous_state is None else "M"
                    colour = STATE_COLOURS[current_state]
                host = '.'.join(reversed(path.split('/')))
                agent = path
                fout.write("{}|{}|{}|{}/{}|{}".format(
                    timestamp, agent, action, path, host, colour))
                fout.write('\n')
    print("Output {} changes.".format(changes))
    print("Output {} deletions.".format(deletions))
    return changes, deletions


def main():
    """Run every step in the order of the original script."""
    peek_first_line()
    convert_bl_linkage()
    print(to_gource("1996|appserver.ed.ac.uk|portico.bl.uk 1"))
    load_york_linkage()
    write_york_gource_log()


if __name__ == "__main__":
    main()