In [1]:
'''
import requests as req

def get_plausible_url(domain):
    yield 'https://www.{}'.format(domain)
    yield 'https://{}'.format(domain)
    yield 'http://{}'.format(domain)
    yield 'http://www.{}'.format(domain)

with open('top10000.txt') as alexa_list:
    index = 1
    for website in alexa_list:
        website = website.strip()

        for url in get_plausible_url(website):
            robotstxt_url = '{}/robots.txt'.format(url)
            print('[{}] downloading {}'.format(index, robotstxt_url))
            try:
                response = req.get(robotstxt_url, timeout=5)
            except:
                print('[{}] download failed for {}'.format(index, robotstxt_url))
                continue
            else:
                with open('robotstxt_new/{}'.format(website), 'wb') as f:
                    f.write(response.content)
                print('[{}] written {}'.format(index, robotstxt_url))
                break

        index += 1
'''
Out[1]:
"\nimport requests as req\n\ndef get_plausible_url(domain):\n    yield 'https://www.{}'.format(domain)\n    yield 'https://{}'.format(domain)\n    yield 'http://{}'.format(domain)\n    yield 'http://www.{}'.format(domain)\n\nwith open('top10000.txt') as alexa_list:\n    index = 1\n    for website in alexa_list:\n        website = website.strip()\n\n        for url in get_plausible_url(website):\n            robotstxt_url = '{}/robots.txt'.format(url)\n            print('[{}] downloading {}'.format(index, robotstxt_url))\n            try:\n                response = req.get(robotstxt_url, timeout=5)\n            except:\n                print('[{}] download failed for {}'.format(index, robotstxt_url))\n                continue\n            else:\n                with open('robotstxt_new/{}'.format(website), 'wb') as f:\n                    f.write(response.content)\n                print('[{}] written {}'.format(index, robotstxt_url))\n                break\n\n        index += 1\n"
In [2]:
from os import listdir
from os.path import isfile, join
import pandas as pd
from termcolor import colored
import sys

PATH_TO_DIR = './robotstxt'
files = (f for f in listdir(PATH_TO_DIR) if isfile(join(PATH_TO_DIR, f)))

user_agents = set()
directive_count = {}
count_per_website = {}

def parse_robotstxt(content):
    directives_on_this_website = set()
    
    lines = content.split('\n')
    for line in lines:
        hash_pos = line.find('#')
        if hash_pos != -1:
            line = line[0: hash_pos].strip()

        line = line.strip()
        if not line:
            continue

        line = line.strip()
        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()

        if field not in directive_count:
            directive_count[field] = 0
        directive_count[field] += 1
        
        if field not in directives_on_this_website:
            directives_on_this_website.add(field)
            if field not in count_per_website:
                count_per_website[field] = 0
            count_per_website[field] += 1
        
        if field == 'user-agent':
            user_agents.add(value)

        
website_count = 0
for filename in files:
    with open('robotstxt/{}'.format(filename), 'r') as f:
        website_count += 1
        try:
            content = f.read()
            parse_robotstxt(content)
        except:
            print("error occured while parsing {}".format(filename), file=sys.stderr)
print("No of websites : {}".format(website_count))
No of websites : 965
error occured while parsing beeg.com
error occured while parsing drive2.ru
error occured while parsing media.tumblr.com
error occured while parsing epochtimes.com
error occured while parsing codepen.io
error occured while parsing istockphoto.com
error occured while parsing amazonaws.com
error occured while parsing myway.com
error occured while parsing thewhizmarketing.com
error occured while parsing bancodevenezuela.com
error occured while parsing billdesk.com
error occured while parsing ozon.ru
error occured while parsing banesconline.com
error occured while parsing leboncoin.fr
error occured while parsing sportzbonanza.com
error occured while parsing tabelog.com
error occured while parsing ltn.com.tw
error occured while parsing taleo.net
error occured while parsing bet9ja.com
error occured while parsing aparat.com
error occured while parsing yy.com
error occured while parsing alipay.com
error occured while parsing jqw.com
error occured while parsing jd.hk
error occured while parsing t.me
error occured while parsing varzesh3.com
error occured while parsing wiktionary.org
error occured while parsing farsnews.com
error occured while parsing citi.com
error occured while parsing 126.com
error occured while parsing nga.cn
error occured while parsing justdial.com
error occured while parsing lordfilms.tv
error occured while parsing kissanime.ru
error occured while parsing onlinesbi.com
error occured while parsing telegram.org
error occured while parsing xinhuanet.com
error occured while parsing huffpost.com
error occured while parsing jianshu.com
error occured while parsing kissasian.sh
error occured while parsing incometaxindiaefiling.gov.in
error occured while parsing qualtrics.com
error occured while parsing gamespot.com
error occured while parsing zcool.com.cn
error occured while parsing epfindia.gov.in
error occured while parsing macys.com
error occured while parsing notifications.website
error occured while parsing force.com
error occured while parsing irctc.co.in
error occured while parsing nintendo.com
error occured while parsing investing.com
error occured while parsing exhentai.org
error occured while parsing tencent.com
error occured while parsing uptobox.com
error occured while parsing ptt.cc
error occured while parsing rpgmasterleague.com
error occured while parsing grammarly.com
error occured while parsing acs.org
error occured while parsing 178.com
error occured while parsing w3school.com.cn
error occured while parsing storiespace.com
error occured while parsing delta.com
error occured while parsing eastday.com
error occured while parsing redfin.com
error occured while parsing godaddy.com
error occured while parsing office365.com
error occured while parsing youdao.com
error occured while parsing animeflv.net
error occured while parsing 360.com
error occured while parsing uidai.gov.in
error occured while parsing syosetu.com
error occured while parsing dangdang.com
error occured while parsing herokuapp.com
error occured while parsing naukri.com
error occured while parsing cnzz.com
error occured while parsing yahoo.co.jp
error occured while parsing sciencedirect.com
error occured while parsing mobile01.com
error occured while parsing caixa.gov.br
error occured while parsing afreecatv.com
error occured while parsing sina.com.cn
error occured while parsing binance.com
error occured while parsing wikimedia.org
error occured while parsing wattpad.com
error occured while parsing abola.pt
error occured while parsing prnt.sc
error occured while parsing live.com
error occured while parsing gmarket.co.kr
error occured while parsing notify-service.com
error occured while parsing rednet.cn
error occured while parsing line.me
error occured while parsing gamersky.com
error occured while parsing mercari.com
error occured while parsing azure.com
error occured while parsing heroesofrpg.com
error occured while parsing zhaopin.com
error occured while parsing yespornplease.com
error occured while parsing akoam.net
error occured while parsing zhibo8.cc
error occured while parsing intoday.in
error occured while parsing jb51.net
error occured while parsing naver.jp
error occured while parsing bestbuy.com
error occured while parsing blog.me
error occured while parsing getawesome1.com
error occured while parsing bankmellat.ir
error occured while parsing crptgate.com
error occured while parsing panda.tv
error occured while parsing dspmulti.com
error occured while parsing myshopify.com
error occured while parsing hatenablog.com
error occured while parsing fc2.com
error occured while parsing japanpost.jp
error occured while parsing patria.org.ve
error occured while parsing joins.com
error occured while parsing jooble.org
error occured while parsing poe.trade
error occured while parsing asos.com
error occured while parsing canada.ca
error occured while parsing brilio.net
error occured while parsing drudgereport.com
error occured while parsing aliyun.com
error occured while parsing myanmarload.com
error occured while parsing book118.com
error occured while parsing wikipedia.org
error occured while parsing pchome.com.tw
error occured while parsing mayoclinic.org
error occured while parsing caijing.com.cn
error occured while parsing hh.ru
error occured while parsing siteadvisor.com
error occured while parsing accuweather.com
error occured while parsing hp.com
error occured while parsing office.com
In [3]:
print(pd.DataFrame(user_agents, columns=['User Agents (Distinct)']).to_string())
                                User Agents (Distinct)
0                           sitecheck.internetseer.com
1                                 Bookmark search tool
2                                      Googlebot-Image
3                                             Telesoft
4                                               libwww
5                                          gsa-crawler
6                                          OmtrBot/1.0
7                               Xenu Link Sleuth/1.3.8
8                                            WebBandit
9                                          TelegramBot
10                                              proxem
11                                         BingPreview
12                                         archive.org
13                                   Baiduspider-video
14                                            Teleport
15                                  NetResearchServer*
16                                         AmiSoftware
17                                       BunnySlippers
18                                                Pipl
19                                             mozDex*
20                                AdsBot-Google-Mobile
21                                              wotbox
22                                                 008
23                              aibang-bot Disallow: /
24                                           grapeshot
25                                            Niki-Bot
26                                            ScoutJet
27                          www.aibang.com Disallow: /
28                                     YaDirectFetcher
29                                              coccoc
30                                           WeSEE_Bot
31                                              Yandex
32                                           NICErsPRO
33                                               linko
34                                            naverbot
35                                     gsa-crawler-www
36                                          Riddlerbot
37                                            Naverbot
38                                                FAST
39                                       msnbot-mobile
40                                           Flipboard
41   Mozilla/4.0 (compatible; MSIE 4.01; Windows NT...
42                                        Pinterestbot
43                                             Maxthon
44                                              Cision
45     Rome Client (http://tinyurl.com/64t5n) Ver: 0.9
46                                          fr_crawler
47                                              MIIxpc
48                                              Foobot
49                                        Bullseye/1.0
50                                            Cincobot
51                                            Digimind
52                                            infoseek
53                                      FlipboardProxy
54                                Sogou web spider/4.0
55                               CherryPickerElite/1.0
56                                        zoomRank/2.0
57                                    alexa site audit
58                                 facebookexternalhit
59                                  Superfeedr bot/2.0
60                                          LinkWalker
61                                     Yahoo-MMCrawler
62                                Sogou web spider/3.0
63                                         LinkedInBot
64                                            JennyBot
65                                                 vsw
66                                          Bitvorebot
67                                               Sogou
68                                            YodaoBot
69                                                Yeti
70                                     WebAlta Crawler
71                                           auramundi
72                                        StackRambler
73                                      googlebot-news
74                                         DuckDuckbot
75                                             Gigabot
76   Mozilla/5.0 (compatible; Sosospider/2.0; +http...
77                                      Googlebot-News
78                                             voltron
79                                         Rome Client
80                                             Openbot
81                                        URLy Warning
82                                        GetRight/4.2
83                                           Uptimebot
84                                                BBot
85                                       Aqua_Products
86                                           looksmart
87                               Mediapartners-Google*
88                                         Baiduspider
89                                              omgili
90                               KDDI-Googlebot-Mobile
91                                          Exabot/3.0
92                                          JikeSpider
93                                           360spider
94                                      googlebot_news
95                                           flipboard
96                                              Feedly
97                                     adequat-systems
98                                             sosobot
99                           Screaming Frog SEO Spider
100                                             msrbot
101                                        NPBot-1/2.0
102                                             WikiDo
103                     omgili/0.5 +https://omgili.com
104                                            HTTrack
105                                MS Search 4.0 Robot
106  Baiduspider+(+http://www.baidu.com/search/spid...
107                                    BackDoorBot/1.0
108                                        HTTrack 3.0
109                                            NetAnts
110                                     FunWebProducts
111                                             BUbiNG
112                                            Rambler
113                              Clickagy Intelligence
114                                           test-url
115                                            bingbot
116                                       Baiduspider+
117                                      Sogou spider2
118  Mozilla/5.0 (compatible; heritrix/3.2.0 +http:...
119                                              Wget*
120                                            httplib
121                                       HaosouSpider
122                                          omgilibot
123                                              psbot
124                                           Tailrank
125     Mozilla/4.0 (compatible; BullsEye; Windows 95)
126                                           proximic
127  Mozilla/4.0 (compatible; MSIE 4.01; Windows NT...
128            CCBot/2.0 (http://commoncrawl.org/faq/)
129                                    YandexOntoDBAPI
130                                MS Search 6.0 Robot
131                                          YandexBot
132                                         Genieo/1.0
133                                          Mata Hari
134                  Zeus 32297 Webster Pro V2.9 Win32
135                                         Superfeedr
136                                       TurnitinBot*
137                                            adequat
138                                    yahoo-mmcrawler
139                                         BrandONbot
140                                            Bingbot
141                              aibangbot Disallow: /
142                                        EasouSpider
143                               netEstate NE Crawler
144                                    YandexSitelinks
145                                            LexiBot
146                                         Sogou blog
147                                             Ocelli
148                                  Flaming AttackBot
149                                        lwp-trivial
150                                             ZyBORG
151                                        Yisouspider
152   Crescent Internet ToolPak HTTP OLE Control v.1.0
153                                    TurnitinBot/1.5
154                                              NPBot
155                          trendkite-akashic-crawler
156                                          YoudaoBot
157                                           Meltawer
158                  VCI WebViewer VCI WebViewer Win32
159                                             bender
160  Sogou Pic Spider/3.0(+http://www.sogou.com/doc...
161                                  ichiro/mobile goo
162                                        LNSpiderguy
163                                      NimbleCrawler
164                                          CNCDialer
165                                             Botify
166                                           adidxbot
167                                             Exabot
168                                             Youmag
169                             koubei.com Disallow: /
170                                         Powermarks
171                                     URL_Spider_Pro
172                                      MnoGoSearch/*
173  Mozilla/5.0 (compatible; bnf.fr_bot; +http://w...
174                                             Speedy
175                                   lwp-trivial/1.34
176                                       YandexImages
177                           WebmasterWorld Extractor
178                                   discoverybot/2.0
179                                    WWW-Collector-E
180                                     ConveraCrawler
181                                           WebSnake
182                            google-hoteladsverifier
183                                      Snarfer/1.0.2
184                                            b2w/0.1
185                                            spotter
186                                        vecteurplus
187                                   ProPowerBot/2.14
188                                              Nutch
189                                             Daumoa
190                  Microsoft URL Control - 6.00.8169
191                                            sistrix
192                                              5erue
193                                   googlebot-mobile
194                                   Googlebot-Mobile
195                                            STC-bot
196                                          Clickagy*
197            BlogSearch/2 +http://www.icerocket.com/
198                                     EmailCollector
199                                      ToutiaoSpider
200                 Acunetix Web Vulnerability Scanner
201                                             Bender
202                                             Augure
203                                            moatbot
204                                        EmailSiphon
205                                         humanlinks
206                                             PerMan
207                                 Baiduspider-mobile
208                                         Archive-It
209                                  Sogou News Spider
210                                    ia_archiver/1.6
211                                       ExtractorPro
212                                            BotALot
213                            Xenu's Link Sleuth 1.1c
214                                     A6-Indexer/1.0
215                                               Wget
216                                             exabot
217                                              Teoma
218                                          EmailWolf
219                                          DeepCrawl
220                                               Gort
221                                            Uniscan
222                                           Moreover
223                                        URL Control
224                                            MauiBot
225                                   SearchmetricsBot
226                                      PortalBSpider
227                                         Sitereport
228                                        baiduspider
229                                           TheNomad
230                                      ChinasoSpider
231                                          KSCrawler
232                                         NetinfoBot
233                                      FairAd Client
234                                             DotBot
235                                       Yahoo! Slurp
236                                         DISCo Pump
237                                         SputnikBot
238                                   LinkextractorPro
239                                          SocSciBot
240                                    ArchitextSpider
241                                          AhrefsBot
242                                         WebSearch*
243                                       TightTwatBot
244                                 aibang Disallow: /
245                                        TeleportPro
246  Baiduspider/2.0;+http://www.baidu.com/search/s...
247                                              Slurp
248                                         Orthogaffe
249                           koubeispider Disallow: /
250                                             Ezooms
251                                          Atomz/1.0
252                                   The Intraformant
253                                     Download Ninja
254                                            yacybot
255                                             Xagool
256                                            hloader
257                                        YisouSpider
258                                               Zite
259                                        CazoodleBot
260                                     WochachaSpider
261                          AdsBot-Google-Mobile-Apps
262                                        Szukacz/1.4
263                                Oracle Ultra Search
264                                            ZyBorg*
265                                             Pu_iN*
266                                            trendeo
267                                            Spinn3r
268                                         AhrefsBots
269  Mozilla/5.0(compatible; Baiduspider/2.0; +http...
270                                         Telefonica
271  Baiduspider-image+(+http://www.baidu.com/searc...
272                                         CompSpyBot
273                                            Zealbot
274                                   Offline Explorer
275                                        grub-client
276                                               grub
277                                            Facebot
278                                           Netvibes
279                       Clickagy Intelligence Bot v2
280                                     WebBandit/3.50
281                                           BLP_bbot
282                              Microsoft.URL.Control
283                                             msnbot
284                                            ADmantX
285                                        DittoSpyder
286                                          Meltwater
287                                     Go-http-client
288                                        Y!J-MBS/1.0
289                                           Alexabot
290                                         EdisterBot
291                                           discobot
292                                        linkfluence
293                                        GermCrawler
294                                                puf
295                                YandexScreenshotBot
296                                           Openfind
297                                                PHP
298                                   Baiduspider-favo
299                                         dotbot/1.0
300                                        NetMechanic
301                                         True_Robot
302                     test-url/1.0 libwww-perl/5.800
303                                             blinkx
304                             Openfind data gatherer
305                                          Corporama
306                                                VCI
307                                              PGBot
308                             WebmasterWorldForumBot
309                                   NextGenSearchBot
310                                         MIIxpc/4.2
311                                           NaverBot
312                                              Link*
313                                            Scrubby
314                                   Sogou web spider
315                                             Sindup
316                 FAST Enterprise Crawler 6 / Scirus
317                                           Swiftbot
318                                          googlebot
319                                 CherryPickerSE/1.0
320                                           twiceler
321                                         HMSE_Robot
322                                   YandexSearchShop
323                  Microsoft URL Control - 5.01.4511
324                                         OnetSzukaj
325                                           Twiceler
326                                              spbot
327                                      BaiduMobaider
328                                              Psbot
329                                         vebidoobot
330                                           alexabot
331                                         trendybuzz
332                                        FBSearchBot
333                                          360Spider
334                                 Libreprensabot/1.0
335                                              FAST*
336                                         EtaoSpider
337                                                Zao
338                                           test url
339                                         EroCrawler
340                                            Newzbin
341                                       BlowFish/1.0
342                                      searchpreview
343                                            Mail.Ru
344                                        Webster Pro
345                                            Pingdom
346                                          CCBot/2.0
347                                      SemrushBot-SA
348                                   BaiduImagespider
349                                     magpie-crawler
350                                          Synthesio
351                                   Baiduspider-cpro
352                                    Baiduspider-ads
353                                Web Image Collector
354                                            Freedom
355                                   +Baiduspider/2.0
356                                          Pinterest
357                                       Teleport Pro
358                                              CCBot
359                          Qwam content intelligence
360                     RepoMonkey Bait & Tackle/v1.01
361                                            wegobot
362                                externalfacebookhit
363                                             score3
364                                          MegaIndex
365                                           smspider
366                                              nutch
367                                            MJ12bot
368                                          GurujiBot
369                                          CheeseBot
370                                      omgilibot/0.3
371                                             uipbot
372                                          MagpieRSS
373  Mozilla/4.0 (compatible; Netcraft Web Server S...
374                                                RMA
375                                    Yahoo Pipes 1.0
376                                                  *
377                                            DotBot*
378                                           turingos
379                                        Y!J-SRD/1.0
380                                innosense/Nutch-1.0
381                                        WBSearchBot
382                                      ContextAd Bot
383                                             dotbot
384                                       GwdangSpider
385                                            Gaisbot
386                                          Robozilla
387                                  TrustpilotCrawler
388                                    YandexMobileBot
389                                          Googlebot
390                                        TurnitinBot
391                                           Seekbot*
392                                            WebAuto
393                                               wget
394                                        duckduckbot
395                                          Talkwater
396                                           Nigma.ru
397                                           Cliqzbot
398                                   blinkx_ff_spider
399                                       Iron33/1.0.2
400                                             Lizard
401                                   Baiduspider-news
402                                           Scooter*
403                                    WebCopier v3.2a
404                                Keyword Density/0.9
405                        www.integromedb.org/Crawler
406                                            5emeRue
407                                           Relcybot
408                                            gigabot
409                         cisco-googlebot-enterprise
410                                      ichiro/mobile
411                                        Snapbot/1.0
412                                     trendictionbot
413                                       BLP_bbot/0.1
414                                      berlin-fu-cow
415                                       Java Browser
416                                                DOC
417                                             larbin
418                                OrangeBot-Collector
419                             Fast corporate crawler
420                                      InfoNaviRobot
421  Mozilla/5.0 (compatible; Taboolabot/3.7; +http...
422                                    googlebot-image
423                                             coexel
424                              Microsoft URL Control
425                                           IDentity
426                                      AdsBot-Google
427                                           NerdyBot
428                                        linguatools
429                                            WebAlta
430                                            Balihoo
431                          Acunetix Security Scanner
432                                           VoilaBot
433                           aibangspider Disallow: /
434                                             Xenu's
435                                       ProWebWalker
436                                   CRAZYWEBCRAWLER*
437                                        ia_archiver
438                                        WebEnhancer
439                                         Sosospider
440                                          careerbot
441                                         WebZIP/5.0
442                                         SlySearch*
443                                        MSIECrawler
444                                 Sogou Orion spider
445                                           Knowings
446                                    Website Quester
447                                      YandexMetrika
448                                            netseer
449                                    Googlebot-image
450                                        Web-By-Mail
451                                          BecomeBot
452                               Mediapartners-Google
453                                          moget/2.1
454                 ADmantX Platform Semantic Analyzer
455                                       Browsershots
456                                              Fetch
457                                           CatchBot
458                                         BlogSearch
459                                          Charlotte
460                                      MoodleBot/1.0
461                                               yeti
462                                         Ask n read
463                                        DuckDuckBot
464                                            PiplBot
465                                      Adsbot-Google
466                                           k2spider
467                                             bhcBot
468                                  Sogou inst spider
469                             aibang.com Disallow: /
470                                           Copernic
471                                          trovitBot
472                                            leadbox
473                                            sitebot
474                                    YandexWebmaster
475                                  YandexVideoParser
476                            Google-HotelAdsVerifier
477                                            suzuran
478                                             Jetbot
479                                          deepcrawl
480                                         YandexNews
481                                         ellisphere
482                                       +Baiduspider
483                                                 *;
484                                             Solbot
485                                     CopyRightCheck
486                                        yisouspider
487                                           ShopWiki
488                                              KaBot
489                                     verticalsearch
490                                    JobdiggerSpider
491                                 magpie-crawler/1.1
492                                      Yahoo!  Slurp
493                                             mytwip
494                                   YandexMedianaBot
495                                    Zeus Link Scout
496                                      YoudaoBot/1.0
497                                              moget
498                                    SimplePie/1.1.3
499                                          WebSauger
500                             YandexAccessibilityBot
501                                            BLEXBot
502                                           rogerbot
503                                        Aboundexbot
504                                          psnsearch
505                                             WebZip
506                                       msnbot-media
507                                          psbot/0.1
508                                             SWEBot
509                             CrazyWebCrawler-Spider
510                                             WebZIP
511                                           Adidxbot
512                                                nsa
513                                             WebVac
514                                          WebReaper
515                                            spanner
516                                              Open*
517                                        LinkChecker
518                                           Applebot
519                                    MaxPointCrawler
520                             Domain Re-Animator Bot
521                                       HuihuiSpider
522                                            NewsNow
523                                           heritrix
524                                     libWeb/clsHTTP
525                                           Alexibot
526                                          WebCopier
527                                           Crescent
528                                          Owlin Bot
529                                   ParadigmCrawler*
530                                  Baiduspider-image
531                                          seznambot
532                            Radiation Retriever 1.1
533   SQUID_configured_as_described_at_/help/faq/cache
534                                    YandexDirectDyn
535           Mozilla/5.0 (compatible; Google-Podcast)
536                                           SpankBot
537                                          OrangeBot
538                                             daumoa
539                                            mention
540                              Openfind data gathere
541                                         Mister PiX
542                                       asknread.com
543  Mozilla/5.0 (compatible; JikeSpider; +http://s...
544                                       CherryPicker
545                                     WebCapture 2.0
546                                    archive.org_bot
547                              toCrawl/UrlDispatcher
548                                            SiteArc
549                                             vobsub
550                                   yahoo-blogs/v3.9
551                                         Wget/1.5.3
552                                         RepoMonkey
553                                           scoop.it
554                     Jakarta Commons-HttpClient/3.1
555                                  QueryN Metasearch
556                                             google
557                                  red-app-gsa-p-one
558                                           asterias
559                                    Baiduspider/2.0
560                                       YandexDirect
561                                         tadcrawler
562                                        SiteSnagger
563                                  Search360-Crawler
564                                     True_Robot/1.0
565                                       SiteExplorer
566                                     Callpod Keeper
567                                             cosmos
568                                             Krugle
569                                         UbiCrawler
570  Mozilla/4.0 (compatible; MSIE 6.0; Windows NT;...
571                                               Xenu
572                                             Kraken
573                                         WebZip/4.0
574                                             endeca
575                                    TerrawizBot/1.0
576                                     Twitterbot/1.0
577                                   IntuitGSACrawler
578                                        YadirectBot
579                        ia_archiver-web.archive.org
580                                              teoma
581  +Baiduspider/2.0;++http://www.baidu.com/search...
582                                           AdIdxBot
583                                         SemrushBot
584                                    Googlebot-Video
585                                       Laserlikebot
586                                        YandexVideo
587                                         SandDollar
588                                      cis455crawler
589                                      Python-urllib
590                                            kbcrawl
591                                             MSNBot
592                                           Wget/1.6
593                                     YandexCalendar
594                                       Mail.RU_Bot*
595                                      BuiltBotTough
596                     test-url/1.0 libwww-perl/5.801
597                                   OmniExplorer_Bot
598                                    Google-Sitemaps
599                                          SeznamBot
600                                           Searchie
601                              YahooSeeker/M1A1-R2D2
602                                       HaoSouSpider
603                                    WebCopier v.2.2
604                          aibang-spider Disallow: /
605                                    ChangeDetection
606                                        WebZIP/4.21
607                                CrystalSemanticsBot
608                                        Harvest/1.5
609                                               Zeus
610                                       BotRightHere
611                                          GoogleBot
612                     test-url/1.0 libwww-perl/5.803
613                                            winello
614                                         MSNPTC/1.0
615  Mozilla/4.0 (compatible; MSIE 6.0; Windows NT;...
616                                         Twitterbot
617                                            larbin*
618                                        Mail.RU_Bot
619                                        WebStripper
620    Flipboard/3.2.6 CFNetwork/711.0.6 Darwin/14.0.0
621                                    opinion-tracker
622                                            Speobot
623                                      Kenjin Spider
624                                 LinkScan/8.1a Unix
625                                        Sogouspider
626                                   Wandoujia Spider
627                                         Ezooms/1.0
In [4]:
display(pd.DataFrame(directive_count.items(), columns=['Directive', 'Count']))
Directive Count
0 user-agent 2917
1 disallow 56575
2 crawl-delay 97
3 sitemap 5210
4 allow 5426
5 clean-param 46
6 host 38
7 <!doctype html><html class="en-us no-js " lan... 1
8 noindex 502
9 <!doctype html public "-//w3c//dtd xhtml 1.0 s... 5
10 <html xmlns="http 9
11 ´╗┐user-agent 6
12 request-rate 6
13 visit-time 1
14 <html style="background 1
15 {"timestamp" 1
16 <!-- fd 1
17 <!doctype html public "-//w3c//dtd xhtml 1.0 t... 4
18 <!doctype html><html><head><title>ign error 40... 1
19 <!doctype html /><html><head><title data-react... 1
20 new date().gettime(),event 1
21 <!doctype html><html><head><title>apache tomca... 1
22 <!doctype html><html><head><meta charset="utf-... 1
23 <!doctype html><html class="no-js" lang="en" d... 1
24 @media (min-width 1
25 <!doctype html public "-//w3c//dtd html 4.01 t... 1
26 disllow 1
27 <!doctype html public "-//softquad//dtd hotmet... 1
28 <!doctype html> <html lang="en"> <head> <meta ... 1
29 <!doctype html><html lang="en"><head><meta cha... 1
30 var $default_lang="";</script><link rel="style... 1
31 <!doctype html public "-//w3c//dtd xhtml 1.0 t... 1
32 <html xmlns="https 1
In [5]:
display(pd.DataFrame(map(lambda x: [x[0], x[1], x[1]*100/website_count], count_per_website.items()), columns=['Directive', 'Number of Websites', '%']))
Directive Number of Websites %
0 user-agent 786 81.450777
1 disallow 749 77.616580
2 crawl-delay 70 7.253886
3 sitemap 436 45.181347
4 allow 348 36.062176
5 clean-param 14 1.450777
6 host 38 3.937824
7 <!doctype html><html class="en-us no-js " lan... 1 0.103627
8 noindex 25 2.590674
9 <!doctype html public "-//w3c//dtd xhtml 1.0 s... 5 0.518135
10 <html xmlns="http 9 0.932642
11 ´╗┐user-agent 6 0.621762
12 request-rate 4 0.414508
13 visit-time 1 0.103627
14 <html style="background 1 0.103627
15 {"timestamp" 1 0.103627
16 <!-- fd 1 0.103627
17 <!doctype html public "-//w3c//dtd xhtml 1.0 t... 4 0.414508
18 <!doctype html><html><head><title>ign error 40... 1 0.103627
19 <!doctype html /><html><head><title data-react... 1 0.103627
20 new date().gettime(),event 1 0.103627
21 <!doctype html><html><head><title>apache tomca... 1 0.103627
22 <!doctype html><html><head><meta charset="utf-... 1 0.103627
23 <!doctype html><html class="no-js" lang="en" d... 1 0.103627
24 @media (min-width 1 0.103627
25 <!doctype html public "-//w3c//dtd html 4.01 t... 1 0.103627
26 disllow 1 0.103627
27 <!doctype html public "-//softquad//dtd hotmet... 1 0.103627
28 <!doctype html> <html lang="en"> <head> <meta ... 1 0.103627
29 <!doctype html><html lang="en"><head><meta cha... 1 0.103627
30 var $default_lang="";</script><link rel="style... 1 0.103627
31 <!doctype html public "-//w3c//dtd xhtml 1.0 t... 1 0.103627
32 <html xmlns="https 1 0.103627
In [ ]: