#!/usr/bin/env python
# coding: utf-8

# In[1]:


# One-off download step (kept commented out): fetch robots.txt for each domain
# in top10000.txt, trying a few plausible URL variants per domain.
'''
import requests as req

def get_plausible_url(domain):
    yield 'https://www.{}'.format(domain)
    yield 'https://{}'.format(domain)
    yield 'http://{}'.format(domain)
    yield 'http://www.{}'.format(domain)

with open('top10000.txt') as alexa_list:
    index = 1
    for website in alexa_list:
        website = website.strip()
        for url in get_plausible_url(website):
            robotstxt_url = '{}/robots.txt'.format(url)
            print('[{}] downloading {}'.format(index, robotstxt_url))
            try:
                response = req.get(robotstxt_url, timeout=5)
            except req.RequestException:
                print('[{}] download failed for {}'.format(index, robotstxt_url))
                continue
            else:
                with open('robotstxt_new/{}'.format(website), 'wb') as f:
                    f.write(response.content)
                print('[{}] written {}'.format(index, robotstxt_url))
                break
        index += 1
'''


# In[2]:


import sys
from os import listdir
from os.path import isfile, join

import pandas as pd
from IPython.display import display  # needed when run outside a live notebook

PATH_TO_DIR = './robotstxt'
files = (f for f in listdir(PATH_TO_DIR) if isfile(join(PATH_TO_DIR, f)))

user_agents = set()        # distinct User-agent values across all files
directive_count = {}       # directive -> total number of occurrences
count_per_website = {}     # directive -> number of websites using it at least once


def parse_robotstxt(content):
    directives_on_this_website = set()
    for line in content.split('\n'):
        # Strip comments (everything after '#') and surrounding whitespace.
        hash_pos = line.find('#')
        if hash_pos != -1:
            line = line[:hash_pos]
        line = line.strip()
        if not line or ':' not in line:
            continue
        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()
        directive_count[field] = directive_count.get(field, 0) + 1
        # Count each directive at most once per website.
        if field not in directives_on_this_website:
            directives_on_this_website.add(field)
            count_per_website[field] = count_per_website.get(field, 0) + 1
        if field == 'user-agent':
            user_agents.add(value)


website_count = 0
for filename in files:
    with open(join(PATH_TO_DIR, filename), 'r') as f:
        website_count += 1
        try:
            parse_robotstxt(f.read())
        except Exception:
            print("error occurred while parsing {}".format(filename), file=sys.stderr)

print("No of websites : {}".format(website_count))


# In[3]:


print(pd.DataFrame(sorted(user_agents), columns=['User Agents (Distinct)']).to_string())


# In[4]:


display(pd.DataFrame(list(directive_count.items()), columns=['Directive', 'Count']))


# In[5]:


display(pd.DataFrame(
    [[field, count, count * 100 / website_count] for field, count in count_per_website.items()],
    columns=['Directive', 'Number of Websites', '%']))


# In[ ]: