#!/usr/bin/env python
# coding: utf-8

# In[1]:


# One-off download step (kept commented out): fetch robots.txt for each domain
# in top10000.txt, trying a few plausible URL variants per domain.
'''
import requests as req

def get_plausible_url(domain):
    yield 'https://www.{}'.format(domain)
    yield 'https://{}'.format(domain)
    yield 'http://{}'.format(domain)
    yield 'http://www.{}'.format(domain)

with open('top10000.txt') as alexa_list:
    index = 1
    for website in alexa_list:
        website = website.strip()
        for url in get_plausible_url(website):
            robotstxt_url = '{}/robots.txt'.format(url)
            print('[{}] downloading {}'.format(index, robotstxt_url))
            try:
                response = req.get(robotstxt_url, timeout=5)
            except req.RequestException:
                print('[{}] download failed for {}'.format(index, robotstxt_url))
                continue
            else:
                with open('robotstxt_new/{}'.format(website), 'wb') as f:
                    f.write(response.content)
                print('[{}] written {}'.format(index, robotstxt_url))
                break
        index += 1
'''


# In[2]:


import sys
from os import listdir
from os.path import isfile, join

import pandas as pd
from IPython.display import display  # needed when run outside a live notebook

PATH_TO_DIR = './robotstxt'
files = (f for f in listdir(PATH_TO_DIR) if isfile(join(PATH_TO_DIR, f)))

user_agents = set()        # distinct User-agent values across all files
directive_count = {}       # directive -> total number of occurrences
count_per_website = {}     # directive -> number of websites using it at least once


def parse_robotstxt(content):
    directives_on_this_website = set()
    for line in content.split('\n'):
        # Strip comments (everything after '#') and surrounding whitespace.
        hash_pos = line.find('#')
        if hash_pos != -1:
            line = line[:hash_pos]
        line = line.strip()
        if not line or ':' not in line:
            continue
        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()
        directive_count[field] = directive_count.get(field, 0) + 1
        # Count each directive at most once per website.
        if field not in directives_on_this_website:
            directives_on_this_website.add(field)
            count_per_website[field] = count_per_website.get(field, 0) + 1
        if field == 'user-agent':
            user_agents.add(value)


website_count = 0
for filename in files:
    with open(join(PATH_TO_DIR, filename), 'r') as f:
        website_count += 1
        try:
            parse_robotstxt(f.read())
        except Exception:
            print("error occurred while parsing {}".format(filename), file=sys.stderr)

print("No of websites : {}".format(website_count))


# In[3]:


print(pd.DataFrame(sorted(user_agents), columns=['User Agents (Distinct)']).to_string())


# In[4]:


display(pd.DataFrame(list(directive_count.items()), columns=['Directive', 'Count']))


# In[5]:


display(pd.DataFrame(
    [[field, count, count * 100 / website_count] for field, count in count_per_website.items()],
    columns=['Directive', 'Number of Websites', '%']))


# In[ ]: