# Disabled one-off downloader, kept only as a string literal: nothing between
# the triple quotes is executed.
#
# What the quoted script does when run on its own:
#   * reads one domain per line from top10000.txt;
#   * for each domain tries four URL variants in order (https://www., https://,
#     http://, http://www.) and requests <variant>/robots.txt with a 5 s timeout;
#   * writes the body of the first request that does not raise to
#     robotstxt_new/<domain> and moves on to the next domain.
#
# NOTE(review): the HTTP status is never checked before writing, so error
# pages and HTML landing pages are saved as if they were robots.txt files.
# NOTE(review): output goes to robotstxt_new/ while the analysis in this file
# reads ./robotstxt -- presumably the directory was renamed by hand; confirm.
'''
import requests as req
def get_plausible_url(domain):
yield 'https://www.{}'.format(domain)
yield 'https://{}'.format(domain)
yield 'http://{}'.format(domain)
yield 'http://www.{}'.format(domain)
with open('top10000.txt') as alexa_list:
index = 1
for website in alexa_list:
website = website.strip()
for url in get_plausible_url(website):
robotstxt_url = '{}/robots.txt'.format(url)
print('[{}] downloading {}'.format(index, robotstxt_url))
try:
response = req.get(robotstxt_url, timeout=5)
except:
print('[{}] download failed for {}'.format(index, robotstxt_url))
continue
else:
with open('robotstxt_new/{}'.format(website), 'wb') as f:
f.write(response.content)
print('[{}] written {}'.format(index, robotstxt_url))
break
index += 1
'''
"\nimport requests as req\n\ndef get_plausible_url(domain):\n yield 'https://www.{}'.format(domain)\n yield 'https://{}'.format(domain)\n yield 'http://{}'.format(domain)\n yield 'http://www.{}'.format(domain)\n\nwith open('top10000.txt') as alexa_list:\n index = 1\n for website in alexa_list:\n website = website.strip()\n\n for url in get_plausible_url(website):\n robotstxt_url = '{}/robots.txt'.format(url)\n print('[{}] downloading {}'.format(index, robotstxt_url))\n try:\n response = req.get(robotstxt_url, timeout=5)\n except:\n print('[{}] download failed for {}'.format(index, robotstxt_url))\n continue\n else:\n with open('robotstxt_new/{}'.format(website), 'wb') as f:\n f.write(response.content)\n print('[{}] written {}'.format(index, robotstxt_url))\n break\n\n index += 1\n"
from os import listdir
from os.path import isfile, join
import pandas as pd
from termcolor import colored
import sys
# Directory that holds the saved robots.txt files, one regular file per website.
PATH_TO_DIR = './robotstxt'

# Names of the regular files directly inside PATH_TO_DIR (sub-directories are
# skipped).  Built as a sorted list rather than a generator so that
#   * the files are processed in the same order on every run (os.listdir makes
#     no ordering promise), and
#   * the collection can be iterated again, e.g. when the processing loop is
#     re-run, instead of silently yielding nothing the second time.
files = sorted(f for f in listdir(PATH_TO_DIR) if isfile(join(PATH_TO_DIR, f)))
# Accumulators shared by every parse_robotstxt() call.
user_agents = set()       # distinct User-agent values seen across all files
directive_count = {}      # field name -> number of lines that use the field
count_per_website = {}    # field name -> number of files using it at least once


def parse_robotstxt(content):
    """Tally the directives found in one robots.txt document.

    The text is processed line by line.  Everything from the first ``#`` on a
    line is treated as a comment and dropped, blank lines are ignored, and
    each remaining line is split at its first ``:`` into a field name
    (lower-cased) and a value.

    Lines that contain no ``:`` are skipped.  Previously such a line raised
    ``ValueError`` half-way through the document, after the module-level
    counters had already been updated for the lines before it.

    Args:
        content: Full text of a single robots.txt file.

    Returns:
        None.  Results are accumulated in the module-level ``directive_count``
        (one increment per line), ``count_per_website`` (at most one increment
        per field per call) and ``user_agents`` (values of ``user-agent``
        lines).
    """
    directives_on_this_website = set()

    # A byte-order mark is not whitespace, so str.strip() leaves it in place;
    # remove it up front so a field on the first line is not stored under a
    # separate, visually identical key.
    for raw_line in content.lstrip('\ufeff').split('\n'):
        line = raw_line.partition('#')[0].strip()
        if not line:
            continue

        field, colon, value = line.partition(':')
        if not colon:
            # Not a "field: value" record (e.g. stray text); ignore the line
            # and keep going with the rest of the document.
            continue
        field = field.strip().lower()
        value = value.strip()

        directive_count[field] = directive_count.get(field, 0) + 1

        # Count each field once per document, however often it repeats.
        if field not in directives_on_this_website:
            directives_on_this_website.add(field)
            count_per_website[field] = count_per_website.get(field, 0) + 1

        if field == 'user-agent':
            user_agents.add(value)
# Run parse_robotstxt() over every listed file and count how many files were
# visited.  The count includes files that fail to parse, because the
# percentage table further down divides by this total.
website_count = 0
for filename in files:
    # Build the path from PATH_TO_DIR so the directory that was listed and the
    # directory that is read can never drift apart.
    # The files are raw server responses in unknown encodings.  Decode as
    # UTF-8 and replace undecodable bytes instead of dropping the whole file
    # (or depending on the platform's default encoding).
    with open(join(PATH_TO_DIR, filename), 'r', encoding='utf-8', errors='replace') as f:
        website_count += 1
        try:
            content = f.read()
            parse_robotstxt(content)
        except Exception:
            # Best-effort: report the file and continue with the next one.
            # `except Exception` instead of a bare `except` keeps Ctrl-C and
            # SystemExit able to stop the run.
            print("error occured while parsing {}".format(filename), file=sys.stderr)
print("No of websites : {}".format(website_count))
No of websites : 965
error occured while parsing beeg.com error occured while parsing drive2.ru error occured while parsing media.tumblr.com error occured while parsing epochtimes.com error occured while parsing codepen.io error occured while parsing istockphoto.com error occured while parsing amazonaws.com error occured while parsing myway.com error occured while parsing thewhizmarketing.com error occured while parsing bancodevenezuela.com error occured while parsing billdesk.com error occured while parsing ozon.ru error occured while parsing banesconline.com error occured while parsing leboncoin.fr error occured while parsing sportzbonanza.com error occured while parsing tabelog.com error occured while parsing ltn.com.tw error occured while parsing taleo.net error occured while parsing bet9ja.com error occured while parsing aparat.com error occured while parsing yy.com error occured while parsing alipay.com error occured while parsing jqw.com error occured while parsing jd.hk error occured while parsing t.me error occured while parsing varzesh3.com error occured while parsing wiktionary.org error occured while parsing farsnews.com error occured while parsing citi.com error occured while parsing 126.com error occured while parsing nga.cn error occured while parsing justdial.com error occured while parsing lordfilms.tv error occured while parsing kissanime.ru error occured while parsing onlinesbi.com error occured while parsing telegram.org error occured while parsing xinhuanet.com error occured while parsing huffpost.com error occured while parsing jianshu.com error occured while parsing kissasian.sh error occured while parsing incometaxindiaefiling.gov.in error occured while parsing qualtrics.com error occured while parsing gamespot.com error occured while parsing zcool.com.cn error occured while parsing epfindia.gov.in error occured while parsing macys.com error occured while parsing notifications.website error occured while parsing force.com error occured while parsing irctc.co.in 
error occured while parsing nintendo.com error occured while parsing investing.com error occured while parsing exhentai.org error occured while parsing tencent.com error occured while parsing uptobox.com error occured while parsing ptt.cc error occured while parsing rpgmasterleague.com error occured while parsing grammarly.com error occured while parsing acs.org error occured while parsing 178.com error occured while parsing w3school.com.cn error occured while parsing storiespace.com error occured while parsing delta.com error occured while parsing eastday.com error occured while parsing redfin.com error occured while parsing godaddy.com error occured while parsing office365.com error occured while parsing youdao.com error occured while parsing animeflv.net error occured while parsing 360.com error occured while parsing uidai.gov.in error occured while parsing syosetu.com error occured while parsing dangdang.com error occured while parsing herokuapp.com error occured while parsing naukri.com error occured while parsing cnzz.com error occured while parsing yahoo.co.jp error occured while parsing sciencedirect.com error occured while parsing mobile01.com error occured while parsing caixa.gov.br error occured while parsing afreecatv.com error occured while parsing sina.com.cn error occured while parsing binance.com error occured while parsing wikimedia.org error occured while parsing wattpad.com error occured while parsing abola.pt error occured while parsing prnt.sc error occured while parsing live.com error occured while parsing gmarket.co.kr error occured while parsing notify-service.com error occured while parsing rednet.cn error occured while parsing line.me error occured while parsing gamersky.com error occured while parsing mercari.com error occured while parsing azure.com error occured while parsing heroesofrpg.com error occured while parsing zhaopin.com error occured while parsing yespornplease.com error occured while parsing akoam.net error occured while 
parsing zhibo8.cc error occured while parsing intoday.in error occured while parsing jb51.net error occured while parsing naver.jp error occured while parsing bestbuy.com error occured while parsing blog.me error occured while parsing getawesome1.com error occured while parsing bankmellat.ir error occured while parsing crptgate.com error occured while parsing panda.tv error occured while parsing dspmulti.com error occured while parsing myshopify.com error occured while parsing hatenablog.com error occured while parsing fc2.com error occured while parsing japanpost.jp error occured while parsing patria.org.ve error occured while parsing joins.com error occured while parsing jooble.org error occured while parsing poe.trade error occured while parsing asos.com error occured while parsing canada.ca error occured while parsing brilio.net error occured while parsing drudgereport.com error occured while parsing aliyun.com error occured while parsing myanmarload.com error occured while parsing book118.com error occured while parsing wikipedia.org error occured while parsing pchome.com.tw error occured while parsing mayoclinic.org error occured while parsing caijing.com.cn error occured while parsing hh.ru error occured while parsing siteadvisor.com error occured while parsing accuweather.com error occured while parsing hp.com error occured while parsing office.com
# Print every distinct User-agent value as a one-column table.
# The values live in a set, whose iteration order is not stable between runs;
# sorting first makes the row order and index numbers reproducible.
print(pd.DataFrame(sorted(user_agents), columns=['User Agents (Distinct)']).to_string())
User Agents (Distinct) 0 sitecheck.internetseer.com 1 Bookmark search tool 2 Googlebot-Image 3 Telesoft 4 libwww 5 gsa-crawler 6 OmtrBot/1.0 7 Xenu Link Sleuth/1.3.8 8 WebBandit 9 TelegramBot 10 proxem 11 BingPreview 12 archive.org 13 Baiduspider-video 14 Teleport 15 NetResearchServer* 16 AmiSoftware 17 BunnySlippers 18 Pipl 19 mozDex* 20 AdsBot-Google-Mobile 21 wotbox 22 008 23 aibang-bot Disallow: / 24 grapeshot 25 Niki-Bot 26 ScoutJet 27 www.aibang.com Disallow: / 28 YaDirectFetcher 29 coccoc 30 WeSEE_Bot 31 Yandex 32 NICErsPRO 33 linko 34 naverbot 35 gsa-crawler-www 36 Riddlerbot 37 Naverbot 38 FAST 39 msnbot-mobile 40 Flipboard 41 Mozilla/4.0 (compatible; MSIE 4.01; Windows NT... 42 Pinterestbot 43 Maxthon 44 Cision 45 Rome Client (http://tinyurl.com/64t5n) Ver: 0.9 46 fr_crawler 47 MIIxpc 48 Foobot 49 Bullseye/1.0 50 Cincobot 51 Digimind 52 infoseek 53 FlipboardProxy 54 Sogou web spider/4.0 55 CherryPickerElite/1.0 56 zoomRank/2.0 57 alexa site audit 58 facebookexternalhit 59 Superfeedr bot/2.0 60 LinkWalker 61 Yahoo-MMCrawler 62 Sogou web spider/3.0 63 LinkedInBot 64 JennyBot 65 vsw 66 Bitvorebot 67 Sogou 68 YodaoBot 69 Yeti 70 WebAlta Crawler 71 auramundi 72 StackRambler 73 googlebot-news 74 DuckDuckbot 75 Gigabot 76 Mozilla/5.0 (compatible; Sosospider/2.0; +http... 77 Googlebot-News 78 voltron 79 Rome Client 80 Openbot 81 URLy Warning 82 GetRight/4.2 83 Uptimebot 84 BBot 85 Aqua_Products 86 looksmart 87 Mediapartners-Google* 88 Baiduspider 89 omgili 90 KDDI-Googlebot-Mobile 91 Exabot/3.0 92 JikeSpider 93 360spider 94 googlebot_news 95 flipboard 96 Feedly 97 adequat-systems 98 sosobot 99 Screaming Frog SEO Spider 100 msrbot 101 NPBot-1/2.0 102 WikiDo 103 omgili/0.5 +https://omgili.com 104 HTTrack 105 MS Search 4.0 Robot 106 Baiduspider+(+http://www.baidu.com/search/spid... 
107 BackDoorBot/1.0 108 HTTrack 3.0 109 NetAnts 110 FunWebProducts 111 BUbiNG 112 Rambler 113 Clickagy Intelligence 114 test-url 115 bingbot 116 Baiduspider+ 117 Sogou spider2 118 Mozilla/5.0 (compatible; heritrix/3.2.0 +http:... 119 Wget* 120 httplib 121 HaosouSpider 122 omgilibot 123 psbot 124 Tailrank 125 Mozilla/4.0 (compatible; BullsEye; Windows 95) 126 proximic 127 Mozilla/4.0 (compatible; MSIE 4.01; Windows NT... 128 CCBot/2.0 (http://commoncrawl.org/faq/) 129 YandexOntoDBAPI 130 MS Search 6.0 Robot 131 YandexBot 132 Genieo/1.0 133 Mata Hari 134 Zeus 32297 Webster Pro V2.9 Win32 135 Superfeedr 136 TurnitinBot* 137 adequat 138 yahoo-mmcrawler 139 BrandONbot 140 Bingbot 141 aibangbot Disallow: / 142 EasouSpider 143 netEstate NE Crawler 144 YandexSitelinks 145 LexiBot 146 Sogou blog 147 Ocelli 148 Flaming AttackBot 149 lwp-trivial 150 ZyBORG 151 Yisouspider 152 Crescent Internet ToolPak HTTP OLE Control v.1.0 153 TurnitinBot/1.5 154 NPBot 155 trendkite-akashic-crawler 156 YoudaoBot 157 Meltawer 158 VCI WebViewer VCI WebViewer Win32 159 bender 160 Sogou Pic Spider/3.0(+http://www.sogou.com/doc... 161 ichiro/mobile goo 162 LNSpiderguy 163 NimbleCrawler 164 CNCDialer 165 Botify 166 adidxbot 167 Exabot 168 Youmag 169 koubei.com Disallow: / 170 Powermarks 171 URL_Spider_Pro 172 MnoGoSearch/* 173 Mozilla/5.0 (compatible; bnf.fr_bot; +http://w... 
174 Speedy 175 lwp-trivial/1.34 176 YandexImages 177 WebmasterWorld Extractor 178 discoverybot/2.0 179 WWW-Collector-E 180 ConveraCrawler 181 WebSnake 182 google-hoteladsverifier 183 Snarfer/1.0.2 184 b2w/0.1 185 spotter 186 vecteurplus 187 ProPowerBot/2.14 188 Nutch 189 Daumoa 190 Microsoft URL Control - 6.00.8169 191 sistrix 192 5erue 193 googlebot-mobile 194 Googlebot-Mobile 195 STC-bot 196 Clickagy* 197 BlogSearch/2 +http://www.icerocket.com/ 198 EmailCollector 199 ToutiaoSpider 200 Acunetix Web Vulnerability Scanner 201 Bender 202 Augure 203 moatbot 204 EmailSiphon 205 humanlinks 206 PerMan 207 Baiduspider-mobile 208 Archive-It 209 Sogou News Spider 210 ia_archiver/1.6 211 ExtractorPro 212 BotALot 213 Xenu's Link Sleuth 1.1c 214 A6-Indexer/1.0 215 Wget 216 exabot 217 Teoma 218 EmailWolf 219 DeepCrawl 220 Gort 221 Uniscan 222 Moreover 223 URL Control 224 MauiBot 225 SearchmetricsBot 226 PortalBSpider 227 Sitereport 228 baiduspider 229 TheNomad 230 ChinasoSpider 231 KSCrawler 232 NetinfoBot 233 FairAd Client 234 DotBot 235 Yahoo! Slurp 236 DISCo Pump 237 SputnikBot 238 LinkextractorPro 239 SocSciBot 240 ArchitextSpider 241 AhrefsBot 242 WebSearch* 243 TightTwatBot 244 aibang Disallow: / 245 TeleportPro 246 Baiduspider/2.0;+http://www.baidu.com/search/s... 247 Slurp 248 Orthogaffe 249 koubeispider Disallow: / 250 Ezooms 251 Atomz/1.0 252 The Intraformant 253 Download Ninja 254 yacybot 255 Xagool 256 hloader 257 YisouSpider 258 Zite 259 CazoodleBot 260 WochachaSpider 261 AdsBot-Google-Mobile-Apps 262 Szukacz/1.4 263 Oracle Ultra Search 264 ZyBorg* 265 Pu_iN* 266 trendeo 267 Spinn3r 268 AhrefsBots 269 Mozilla/5.0(compatible; Baiduspider/2.0; +http... 270 Telefonica 271 Baiduspider-image+(+http://www.baidu.com/searc... 
272 CompSpyBot 273 Zealbot 274 Offline Explorer 275 grub-client 276 grub 277 Facebot 278 Netvibes 279 Clickagy Intelligence Bot v2 280 WebBandit/3.50 281 BLP_bbot 282 Microsoft.URL.Control 283 msnbot 284 ADmantX 285 DittoSpyder 286 Meltwater 287 Go-http-client 288 Y!J-MBS/1.0 289 Alexabot 290 EdisterBot 291 discobot 292 linkfluence 293 GermCrawler 294 puf 295 YandexScreenshotBot 296 Openfind 297 PHP 298 Baiduspider-favo 299 dotbot/1.0 300 NetMechanic 301 True_Robot 302 test-url/1.0 libwww-perl/5.800 303 blinkx 304 Openfind data gatherer 305 Corporama 306 VCI 307 PGBot 308 WebmasterWorldForumBot 309 NextGenSearchBot 310 MIIxpc/4.2 311 NaverBot 312 Link* 313 Scrubby 314 Sogou web spider 315 Sindup 316 FAST Enterprise Crawler 6 / Scirus 317 Swiftbot 318 googlebot 319 CherryPickerSE/1.0 320 twiceler 321 HMSE_Robot 322 YandexSearchShop 323 Microsoft URL Control - 5.01.4511 324 OnetSzukaj 325 Twiceler 326 spbot 327 BaiduMobaider 328 Psbot 329 vebidoobot 330 alexabot 331 trendybuzz 332 FBSearchBot 333 360Spider 334 Libreprensabot/1.0 335 FAST* 336 EtaoSpider 337 Zao 338 test url 339 EroCrawler 340 Newzbin 341 BlowFish/1.0 342 searchpreview 343 Mail.Ru 344 Webster Pro 345 Pingdom 346 CCBot/2.0 347 SemrushBot-SA 348 BaiduImagespider 349 magpie-crawler 350 Synthesio 351 Baiduspider-cpro 352 Baiduspider-ads 353 Web Image Collector 354 Freedom 355 +Baiduspider/2.0 356 Pinterest 357 Teleport Pro 358 CCBot 359 Qwam content intelligence 360 RepoMonkey Bait & Tackle/v1.01 361 wegobot 362 externalfacebookhit 363 score3 364 MegaIndex 365 smspider 366 nutch 367 MJ12bot 368 GurujiBot 369 CheeseBot 370 omgilibot/0.3 371 uipbot 372 MagpieRSS 373 Mozilla/4.0 (compatible; Netcraft Web Server S... 
374 RMA 375 Yahoo Pipes 1.0 376 * 377 DotBot* 378 turingos 379 Y!J-SRD/1.0 380 innosense/Nutch-1.0 381 WBSearchBot 382 ContextAd Bot 383 dotbot 384 GwdangSpider 385 Gaisbot 386 Robozilla 387 TrustpilotCrawler 388 YandexMobileBot 389 Googlebot 390 TurnitinBot 391 Seekbot* 392 WebAuto 393 wget 394 duckduckbot 395 Talkwater 396 Nigma.ru 397 Cliqzbot 398 blinkx_ff_spider 399 Iron33/1.0.2 400 Lizard 401 Baiduspider-news 402 Scooter* 403 WebCopier v3.2a 404 Keyword Density/0.9 405 www.integromedb.org/Crawler 406 5emeRue 407 Relcybot 408 gigabot 409 cisco-googlebot-enterprise 410 ichiro/mobile 411 Snapbot/1.0 412 trendictionbot 413 BLP_bbot/0.1 414 berlin-fu-cow 415 Java Browser 416 DOC 417 larbin 418 OrangeBot-Collector 419 Fast corporate crawler 420 InfoNaviRobot 421 Mozilla/5.0 (compatible; Taboolabot/3.7; +http... 422 googlebot-image 423 coexel 424 Microsoft URL Control 425 IDentity 426 AdsBot-Google 427 NerdyBot 428 linguatools 429 WebAlta 430 Balihoo 431 Acunetix Security Scanner 432 VoilaBot 433 aibangspider Disallow: / 434 Xenu's 435 ProWebWalker 436 CRAZYWEBCRAWLER* 437 ia_archiver 438 WebEnhancer 439 Sosospider 440 careerbot 441 WebZIP/5.0 442 SlySearch* 443 MSIECrawler 444 Sogou Orion spider 445 Knowings 446 Website Quester 447 YandexMetrika 448 netseer 449 Googlebot-image 450 Web-By-Mail 451 BecomeBot 452 Mediapartners-Google 453 moget/2.1 454 ADmantX Platform Semantic Analyzer 455 Browsershots 456 Fetch 457 CatchBot 458 BlogSearch 459 Charlotte 460 MoodleBot/1.0 461 yeti 462 Ask n read 463 DuckDuckBot 464 PiplBot 465 Adsbot-Google 466 k2spider 467 bhcBot 468 Sogou inst spider 469 aibang.com Disallow: / 470 Copernic 471 trovitBot 472 leadbox 473 sitebot 474 YandexWebmaster 475 YandexVideoParser 476 Google-HotelAdsVerifier 477 suzuran 478 Jetbot 479 deepcrawl 480 YandexNews 481 ellisphere 482 +Baiduspider 483 *; 484 Solbot 485 CopyRightCheck 486 yisouspider 487 ShopWiki 488 KaBot 489 verticalsearch 490 JobdiggerSpider 491 magpie-crawler/1.1 492 Yahoo! 
Slurp 493 mytwip 494 YandexMedianaBot 495 Zeus Link Scout 496 YoudaoBot/1.0 497 moget 498 SimplePie/1.1.3 499 WebSauger 500 YandexAccessibilityBot 501 BLEXBot 502 rogerbot 503 Aboundexbot 504 psnsearch 505 WebZip 506 msnbot-media 507 psbot/0.1 508 SWEBot 509 CrazyWebCrawler-Spider 510 WebZIP 511 Adidxbot 512 nsa 513 WebVac 514 WebReaper 515 spanner 516 Open* 517 LinkChecker 518 Applebot 519 MaxPointCrawler 520 Domain Re-Animator Bot 521 HuihuiSpider 522 NewsNow 523 heritrix 524 libWeb/clsHTTP 525 Alexibot 526 WebCopier 527 Crescent 528 Owlin Bot 529 ParadigmCrawler* 530 Baiduspider-image 531 seznambot 532 Radiation Retriever 1.1 533 SQUID_configured_as_described_at_/help/faq/cache 534 YandexDirectDyn 535 Mozilla/5.0 (compatible; Google-Podcast) 536 SpankBot 537 OrangeBot 538 daumoa 539 mention 540 Openfind data gathere 541 Mister PiX 542 asknread.com 543 Mozilla/5.0 (compatible; JikeSpider; +http://s... 544 CherryPicker 545 WebCapture 2.0 546 archive.org_bot 547 toCrawl/UrlDispatcher 548 SiteArc 549 vobsub 550 yahoo-blogs/v3.9 551 Wget/1.5.3 552 RepoMonkey 553 scoop.it 554 Jakarta Commons-HttpClient/3.1 555 QueryN Metasearch 556 google 557 red-app-gsa-p-one 558 asterias 559 Baiduspider/2.0 560 YandexDirect 561 tadcrawler 562 SiteSnagger 563 Search360-Crawler 564 True_Robot/1.0 565 SiteExplorer 566 Callpod Keeper 567 cosmos 568 Krugle 569 UbiCrawler 570 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT;... 571 Xenu 572 Kraken 573 WebZip/4.0 574 endeca 575 TerrawizBot/1.0 576 Twitterbot/1.0 577 IntuitGSACrawler 578 YadirectBot 579 ia_archiver-web.archive.org 580 teoma 581 +Baiduspider/2.0;++http://www.baidu.com/search... 
582 AdIdxBot 583 SemrushBot 584 Googlebot-Video 585 Laserlikebot 586 YandexVideo 587 SandDollar 588 cis455crawler 589 Python-urllib 590 kbcrawl 591 MSNBot 592 Wget/1.6 593 YandexCalendar 594 Mail.RU_Bot* 595 BuiltBotTough 596 test-url/1.0 libwww-perl/5.801 597 OmniExplorer_Bot 598 Google-Sitemaps 599 SeznamBot 600 Searchie 601 YahooSeeker/M1A1-R2D2 602 HaoSouSpider 603 WebCopier v.2.2 604 aibang-spider Disallow: / 605 ChangeDetection 606 WebZIP/4.21 607 CrystalSemanticsBot 608 Harvest/1.5 609 Zeus 610 BotRightHere 611 GoogleBot 612 test-url/1.0 libwww-perl/5.803 613 winello 614 MSNPTC/1.0 615 Mozilla/4.0 (compatible; MSIE 6.0; Windows NT;... 616 Twitterbot 617 larbin* 618 Mail.RU_Bot 619 WebStripper 620 Flipboard/3.2.6 CFNetwork/711.0.6 Darwin/14.0.0 621 opinion-tracker 622 Speobot 623 Kenjin Spider 624 LinkScan/8.1a Unix 625 Sogouspider 626 Wandoujia Spider 627 Ezooms/1.0
# Show how many lines, over all files, used each directive.
display(
    pd.DataFrame(
        [(directive, total) for directive, total in directive_count.items()],
        columns=['Directive', 'Count'],
    )
)
Directive | Count | |
---|---|---|
0 | user-agent | 2917 |
1 | disallow | 56575 |
2 | crawl-delay | 97 |
3 | sitemap | 5210 |
4 | allow | 5426 |
5 | clean-param | 46 |
6 | host | 38 |
7 | <!doctype html><html class="en-us no-js " lan... | 1 |
8 | noindex | 502 |
9 | <!doctype html public "-//w3c//dtd xhtml 1.0 s... | 5 |
10 | <html xmlns="http | 9 |
11 | user-agent | 6 |
12 | request-rate | 6 |
13 | visit-time | 1 |
14 | <html style="background | 1 |
15 | {"timestamp" | 1 |
16 | <!-- fd | 1 |
17 | <!doctype html public "-//w3c//dtd xhtml 1.0 t... | 4 |
18 | <!doctype html><html><head><title>ign error 40... | 1 |
19 | <!doctype html /><html><head><title data-react... | 1 |
20 | new date().gettime(),event | 1 |
21 | <!doctype html><html><head><title>apache tomca... | 1 |
22 | <!doctype html><html><head><meta charset="utf-... | 1 |
23 | <!doctype html><html class="no-js" lang="en" d... | 1 |
24 | @media (min-width | 1 |
25 | <!doctype html public "-//w3c//dtd html 4.01 t... | 1 |
26 | disllow | 1 |
27 | <!doctype html public "-//softquad//dtd hotmet... | 1 |
28 | <!doctype html> <html lang="en"> <head> <meta ... | 1 |
29 | <!doctype html><html lang="en"><head><meta cha... | 1 |
30 | var $default_lang="";</script><link rel="style... | 1 |
31 | <!doctype html public "-//w3c//dtd xhtml 1.0 t... | 1 |
32 | <html xmlns="https | 1 |
# Show, per directive, how many files used it at least once and what share of
# all visited files that is.
display(
    pd.DataFrame(
        [
            [directive, sites, sites * 100 / website_count]
            for directive, sites in count_per_website.items()
        ],
        columns=['Directive', 'Number of Websites', '%'],
    )
)
Directive | Number of Websites | % | |
---|---|---|---|
0 | user-agent | 786 | 81.450777 |
1 | disallow | 749 | 77.616580 |
2 | crawl-delay | 70 | 7.253886 |
3 | sitemap | 436 | 45.181347 |
4 | allow | 348 | 36.062176 |
5 | clean-param | 14 | 1.450777 |
6 | host | 38 | 3.937824 |
7 | <!doctype html><html class="en-us no-js " lan... | 1 | 0.103627 |
8 | noindex | 25 | 2.590674 |
9 | <!doctype html public "-//w3c//dtd xhtml 1.0 s... | 5 | 0.518135 |
10 | <html xmlns="http | 9 | 0.932642 |
11 | user-agent | 6 | 0.621762 |
12 | request-rate | 4 | 0.414508 |
13 | visit-time | 1 | 0.103627 |
14 | <html style="background | 1 | 0.103627 |
15 | {"timestamp" | 1 | 0.103627 |
16 | <!-- fd | 1 | 0.103627 |
17 | <!doctype html public "-//w3c//dtd xhtml 1.0 t... | 4 | 0.414508 |
18 | <!doctype html><html><head><title>ign error 40... | 1 | 0.103627 |
19 | <!doctype html /><html><head><title data-react... | 1 | 0.103627 |
20 | new date().gettime(),event | 1 | 0.103627 |
21 | <!doctype html><html><head><title>apache tomca... | 1 | 0.103627 |
22 | <!doctype html><html><head><meta charset="utf-... | 1 | 0.103627 |
23 | <!doctype html><html class="no-js" lang="en" d... | 1 | 0.103627 |
24 | @media (min-width | 1 | 0.103627 |
25 | <!doctype html public "-//w3c//dtd html 4.01 t... | 1 | 0.103627 |
26 | disllow | 1 | 0.103627 |
27 | <!doctype html public "-//softquad//dtd hotmet... | 1 | 0.103627 |
28 | <!doctype html> <html lang="en"> <head> <meta ... | 1 | 0.103627 |
29 | <!doctype html><html lang="en"><head><meta cha... | 1 | 0.103627 |
30 | var $default_lang="";</script><link rel="style... | 1 | 0.103627 |
31 | <!doctype html public "-//w3c//dtd xhtml 1.0 t... | 1 | 0.103627 |
32 | <html xmlns="https | 1 | 0.103627 |