This project implements a simplified version of the `grep` command-line utility to search for data in 54 megabytes' worth of articles. Our main goals are:
Yarkant_County.html
# To get the file names first:
import os
import pandas as pd
import time
file_names = os.listdir('wiki')
# Preview at most the first ten names. Slicing avoids the IndexError that
# indexing positions 0..9 raises when the folder holds fewer than ten files.
for name in file_names[:10]:
    print(name)
print('\n')
## number of files in the folder
print(f"\033[1mNumber of files in the given folder : \033[0m{len(file_names)}")
Bay_of_ConcepciC3B3n.html
Bye_My_Boy.html
Valentin_Yanin.html
Kings_XI_Punjab_in_2014.html
William_Harvey_Lillard.html
Radial_Road_3.html
George_Weldrick.html
Zgornji_Otok.html
Blue_Heelers_(season_8).html
Taggen_Nunatak.html
Number of files in the given folder : 999
# Exploring the contents of the first file:
folder_name = "wiki"
file_name = "Bay_of_ConcepciC3B3n.html"
# The page's meta tag declares charset UTF-8 and its text contains non-ASCII
# characters, so decode explicitly instead of relying on the platform default.
with open(os.path.join(folder_name, file_name), encoding="utf-8") as f:
    print(f.read())
<!DOCTYPE html> <html class="client-nojs" lang="en" dir="ltr"> <head> <meta charset="UTF-8"/> <title>Bay of Concepción - Wikipedia</title> <script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script> <script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRelevantPageName":"Bay_of_Concepción","wgRelevantArticleId":16044270,"wgRequestId":"WKq3wgpAAEIAAMFPZFwAAABQ","wgIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgFlaggedRevsParams":{"tags":{}},"wgStableRevisionId":null,"wgWikiEditorEnabledModules":{"toolbar":true,"dialogs":true,"preview":false,"publish":false},"wgBetaFeaturesFeatures":[],"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","usePageImages":true,"usePageDescriptions":true},"wgPreferredVariant":"en","wgMFDisplayWikibaseDescriptions":{"search":true,"nearby":true,"watchlist":true,"tagline":true},"wgRelatedArticles":null,"wgRelatedArticlesBetaFeatureEnabled":false,"wgRelatedArticlesUseCirrusSearch":true,"wgR
elatedArticlesOnlyUseCirrusSearch":false,"wgULSCurrentAutonym":"English","wgNoticeProject":"wikipedia","wgCentralNoticeCookiesToDelete":[],"wgCentralNoticeCategoriesUsingLegacy":["Fundraising","fundraising"],"wgCategoryTreePageCategoryOptions":"{\"mode\":0,\"hideprefix\":20,\"showcount\":true,\"namespaces\":false}","wgCoordinates":{"lat":-36.683333333333,"lon":-73.033333333333},"wgWikibaseItemId":"Q4874197","wgCentralAuthMobileDomain":false,"wgVisualEditorToolbarScrollOffset":0,"wgEditSubmitButtonLabelPublish":false});mw.loader.state({"ext.globalCssJs.user.styles":"ready","ext.globalCssJs.site.styles":"ready","site.styles":"ready","noscript":"ready","user.styles":"ready","user":"ready","user.options":"loading","user.tokens":"loading","ext.cite.styles":"ready","wikibase.client.init":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","ext.wikimediaBadges":"ready","mediawiki.legacy.shared":"ready","mediawiki.legacy.commonPrint":"ready","mediawiki.sectionAnchor":"ready","mediawiki.skinning.interface":"ready","skins.vector.styles":"ready","ext.globalCssJs.user":"ready","ext.globalCssJs.site":"ready"});mw.loader.implement("user.options@0j3lz3q",function($,jQuery,require,module){mw.user.options.set({"variant":"en"});});mw.loader.implement("user.tokens@1dqfd7l",function ( $, jQuery, require, module ) { mw.user.tokens.set({"editToken":"+\\","patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"});/*@nomin*/; 
});mw.loader.load(["ext.cite.a11y","mediawiki.action.view.postEdit","site","mediawiki.page.startup","mediawiki.user","mediawiki.hidpi","mediawiki.page.ready","mediawiki.legacy.wikibits","mediawiki.searchSuggest","ext.gadget.teahouse","ext.gadget.ReferenceTooltips","ext.gadget.watchlist-notice","ext.gadget.DRN-wizard","ext.gadget.charinsert","ext.gadget.refToolbar","ext.gadget.extra-toolbar-buttons","ext.gadget.switcher","ext.gadget.featured-articles-links","ext.centralauth.centralautologin","mmv.head","mmv.bootstrap.autostart","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.eventLogging.subscriber","ext.wikimediaEvents","ext.navigationTiming","ext.uls.eventlogger","ext.uls.init","ext.uls.interface","ext.quicksurveys.init","ext.centralNotice.geoIP","ext.centralNotice.startUp","skins.vector.js"]);});</script> <link rel="stylesheet" href="/w/load.php?debug=false&lang=en&modules=ext.cite.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cmediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.sectionAnchor%7Cmediawiki.skinning.interface%7Cskins.vector.styles%7Cwikibase.client.init&only=styles&skin=vector"/> <script async="" src="/w/load.php?debug=false&lang=en&modules=startup&only=scripts&skin=vector"></script> <meta name="ResourceLoaderDynamicStyles" content=""/> <link rel="stylesheet" href="/w/load.php?debug=false&lang=en&modules=site.styles&only=styles&skin=vector"/> <meta name="generator" content="MediaWiki 1.29.0-wmf.12"/> <meta name="referrer" content="origin-when-cross-origin"/> <meta property="og:image" content="https://upload.wikimedia.org/wikipedia/commons/9/9d/Txu-oclc-224571178-sj18-04-quiriquina.jpg"/> <link rel="alternate" href="android-app://org.wikipedia/http/en.m.wikipedia.org/wiki/Bay_of_Concepci%C3%B3n"/> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Bay_of_Concepci%C3%B3n&action=edit"/> <link rel="edit" title="Edit this 
page" href="/w/index.php?title=Bay_of_Concepci%C3%B3n&action=edit"/> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"/> <link rel="shortcut icon" href="/static/favicon/wikipedia.ico"/> <link rel="search" type="application/opensearchdescription+xml" href="/w/opensearch_desc.php" title="Wikipedia (en)"/> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"/> <link rel="copyright" href="//creativecommons.org/licenses/by-sa/3.0/"/> <link rel="canonical" href="https://en.wikipedia.org/wiki/Bay_of_Concepci%C3%B3n"/> <link rel="dns-prefetch" href="//login.wikimedia.org"/> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> </head> <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Bay_of_Concepción rootpage-Bay_of_Concepción skin-vector action-view"> <div id="mw-page-base" class="noprint"></div> <div id="mw-head-base" class="noprint"></div> <div id="content" class="mw-body" role="main"> <a id="top"></a> <div id="siteNotice"><!-- CentralNotice --></div> <div class="mw-indicators"> </div> <h1 id="firstHeading" class="firstHeading" lang="en">Bay of Concepción</h1> <div id="bodyContent" class="mw-body-content"> <div id="siteSub">From Wikipedia, the free encyclopedia</div> <div id="contentSub"></div> <div id="jump-to-nav" class="mw-jump"> Jump to: <a href="#mw-head">navigation</a>, <a href="#p-search">search</a> </div> <div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr"><div class="thumb tright"> <div class="thumbinner" style="width:202px;"><a href="/wiki/File:Txu-oclc-224571178-sj18-04-quiriquina.jpg" class="image"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Txu-oclc-224571178-sj18-04-quiriquina.jpg/200px-Txu-oclc-224571178-sj18-04-quiriquina.jpg" width="200" height="248" class="thumbimage" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Txu-oclc-224571178-sj18-04-quiriquina.jpg/300px-Txu-oclc-224571178-sj18-04-quiriquina.jpg 1.5x, 
//upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Txu-oclc-224571178-sj18-04-quiriquina.jpg/400px-Txu-oclc-224571178-sj18-04-quiriquina.jpg 2x" data-file-width="960" data-file-height="1192" /></a> <div class="thumbcaption"> <div class="magnify"><a href="/wiki/File:Txu-oclc-224571178-sj18-04-quiriquina.jpg" class="internal" title="Enlarge"></a></div> Region of BioBio</div> </div> </div> <p>The <b>Bay of Concepción</b> is a natural bay on the coast of the <a href="/wiki/Concepci%C3%B3n_Province,_Chile" title="Concepción Province, Chile">Province of Concepción</a> in the <a href="/wiki/B%C3%ADo_B%C3%ADo_Region" title="Bío Bío Region">Bío Bío Region</a> of <a href="/wiki/Chile" title="Chile">Chile</a>. Within the bay are many of the most important ports of the region and the country, among them <a href="/wiki/Penco" title="Penco">Penco</a>, <a href="/wiki/Talcahuano" title="Talcahuano">Talcahuano</a>, and Lirquén.</p> <p><a href="/wiki/Quiriquina_Island" title="Quiriquina Island">Quiriquina Island</a>, located to the north in the mouth of the bay provides a windbreak. The island creates two entrances to the bay: Boca Chica and Boca Grande. 
Boca Chica, between Quriquina Island and the Peninsula of Tumbes, measures 2 km wide and in its narrower part 1,500 metres, with shoals to the sides and although water depth is 15 metres, the passage of large ships is reduced to 400 metres.<sup id="cite_ref-Espinoza.2C_Enrique_1897_1-0" class="reference"><a href="#cite_note-Espinoza.2C_Enrique_1897-1">[1]</a></sup> Boca Grande, is 5 km wide, with depths of 35 metres, which makes it comodious for large vessels.<sup id="cite_ref-Espinoza.2C_Enrique_1897_1-1" class="reference"><a href="#cite_note-Espinoza.2C_Enrique_1897-1">[1]</a></sup></p> <p>The sector of the bay where the Port of Talcahuano is located is known as the Bay of Talcahuano, and is protected by the Peninsula of Tumbes and Quiriquina Island.<sup id="cite_ref-Espinoza.2C_Enrique_1897_1-2" class="reference"><a href="#cite_note-Espinoza.2C_Enrique_1897-1">[1]</a></sup></p> <h2><span class="mw-headline" id="References">References</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Bay_of_Concepci%C3%B3n&action=edit&section=1" title="Edit section: References">edit</a><span class="mw-editsection-bracket">]</span></span></h2> <ol class="references"> <li id="cite_note-Espinoza.2C_Enrique_1897-1"><span class="mw-cite-backlink">^ <a href="#cite_ref-Espinoza.2C_Enrique_1897_1-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Espinoza.2C_Enrique_1897_1-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Espinoza.2C_Enrique_1897_1-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text">Espinoza, Enrique; 1897. Geografía Descriptiva de la República de Chile. 
Cuarta edición, Imprenta y encuadernación Barcelona, Santiago, Chile.</span></li> </ol> <p><span style="font-size: small;"><span id="coordinates"><a href="/wiki/Geographic_coordinate_system" title="Geographic coordinate system">Coordinates</a>: <span class="plainlinks nourlexpansion"><a class="external text" href="//tools.wmflabs.org/geohack/geohack.php?pagename=Bay_of_Concepci%C3%B3n&params=36_41_S_73_02_W_region:CL_source:kolossus-ruwiki"><span class="geo-default"><span class="geo-dms" title="Maps, aerial photos, and other data for this location"><span class="latitude">36°41′S</span> <span class="longitude">73°02′W</span></span></span><span class="geo-multi-punct"> / </span><span class="geo-nondefault"><span class="geo-dec" title="Maps, aerial photos, and other data for this location">36.683°S 73.033°W</span><span style="display:none"> / <span class="geo">-36.683; -73.033</span></span></span></a></span></span></span></p> <p><br /></p> <table class="metadata plainlinks stub" role="presentation" style="background:transparent"> <tr> <td><a href="/wiki/File:Flag_of_Biob%C3%ADo_Region,_Chile.svg" class="image"><img alt="Stub icon" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Flag_of_Biob%C3%ADo_Region%2C_Chile.svg/40px-Flag_of_Biob%C3%ADo_Region%2C_Chile.svg.png" width="40" height="27" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Flag_of_Biob%C3%ADo_Region%2C_Chile.svg/60px-Flag_of_Biob%C3%ADo_Region%2C_Chile.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Flag_of_Biob%C3%ADo_Region%2C_Chile.svg/80px-Flag_of_Biob%C3%ADo_Region%2C_Chile.svg.png 2x" data-file-width="600" data-file-height="400" /></a></td> <td><i>This <a href="/wiki/B%C3%ADo_B%C3%ADo_Region" title="Bío Bío Region">Bío Bío Region</a> location article is a <a href="/wiki/Wikipedia:Stub" title="Wikipedia:Stub">stub</a>. 
You can help Wikipedia by <a class="external text" href="//en.wikipedia.org/w/index.php?title=Bay_of_Concepci%C3%B3n&action=edit">expanding it</a>.</i> <div class="plainlinks hlist navbar mini" style="position: absolute; right: 15px; display: none;"> <ul> <li class="nv-view"><a href="/wiki/Template:B%C3%ADoB%C3%ADo-geo-stub" title="Template:BíoBío-geo-stub"><abbr title="View this template">v</abbr></a></li> <li class="nv-talk"><a href="/wiki/Template_talk:B%C3%ADoB%C3%ADo-geo-stub" title="Template talk:BíoBío-geo-stub"><abbr title="Discuss this template">t</abbr></a></li> <li class="nv-edit"><a class="external text" href="//en.wikipedia.org/w/index.php?title=Template:B%C3%ADoB%C3%ADo-geo-stub&action=edit"><abbr title="Edit this template">e</abbr></a></li> </ul> </div> </td> </tr> </table> <!-- NewPP limit report Parsed by mw1251 Cached time: 20170208034214 Cache expiry: 2592000 Dynamic content: false CPU time usage: 0.040 seconds Real time usage: 0.057 seconds Preprocessor visited node count: 98/1000000 Preprocessor generated node count: 0/1500000 Post‐expand include size: 4419/2097152 bytes Template argument size: 0/2097152 bytes Highest expansion depth: 3/40 Expensive parser function count: 0/500 Lua time usage: 0.016/10.000 seconds Lua memory usage: 813 KB/50 MB --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 38.984 1 -total 73.51% 28.658 1 Template:Coord 26.14% 10.189 1 Template:Biobío-geo-stub 21.36% 8.328 1 Template:Asbox --> <!-- Saved in parser cache with key enwiki:pcache:idhash:16044270-0!*!0!*!*!4!* and timestamp 20170208034214 and revision id 647460156 --> <noscript><img src="//en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" title="" width="1" height="1" style="border: none; position: absolute;" /></noscript></div> <div class="printfooter"> Retrieved from "<a dir="ltr" 
href="https://en.wikipedia.org/w/index.php?title=Bay_of_Concepción&oldid=647460156">https://en.wikipedia.org/w/index.php?title=Bay_of_Concepción&oldid=647460156</a>" </div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>: <ul><li><a href="/wiki/Category:Landforms_of_B%C3%ADo_B%C3%ADo_Region" title="Category:Landforms of Bío Bío Region">Landforms of Bío Bío Region</a></li><li><a href="/wiki/Category:Bays_of_Chile" title="Category:Bays of Chile">Bays of Chile</a></li><li><a href="/wiki/Category:B%C3%ADo_B%C3%ADo_Region_geography_stubs" title="Category:Bío Bío Region geography stubs">Bío Bío Region geography stubs</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories: <ul><li><a href="/wiki/Category:Coordinates_on_Wikidata" title="Category:Coordinates on Wikidata">Coordinates on Wikidata</a></li><li><a href="/wiki/Category:All_stub_articles" title="Category:All stub articles">All stub articles</a></li></ul></div></div> <div class="visualClear"></div> </div> </div> <div id="mw-navigation"> <h2>Navigation menu</h2> <div id="mw-head"> <div id="p-personal" role="navigation" class="" aria-labelledby="p-personal-label"> <h3 id="p-personal-label">Personal tools</h3> <ul> <li id="pt-anonuserpage">Not logged in</li><li id="pt-anontalk"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n">Talk</a></li><li id="pt-anoncontribs"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y">Contributions</a></li><li id="pt-createaccount"><a href="/w/index.php?title=Special:CreateAccount&returnto=Bay+of+Concepci%C3%B3n" title="You are encouraged to create an account and log in; however, it is not mandatory">Create account</a></li><li id="pt-login"><a 
href="/w/index.php?title=Special:UserLogin&returnto=Bay+of+Concepci%C3%B3n" title="You're encouraged to log in; however, it's not mandatory. [o]" accesskey="o">Log in</a></li> </ul> </div> <div id="left-navigation"> <div id="p-namespaces" role="navigation" class="vectorTabs" aria-labelledby="p-namespaces-label"> <h3 id="p-namespaces-label">Namespaces</h3> <ul> <li id="ca-nstab-main" class="selected"><span><a href="/wiki/Bay_of_Concepci%C3%B3n" title="View the content page [c]" accesskey="c">Article</a></span></li> <li id="ca-talk"><span><a href="/wiki/Talk:Bay_of_Concepci%C3%B3n" title="Discussion about the content page [t]" accesskey="t" rel="discussion">Talk</a></span></li> </ul> </div> <div id="p-variants" role="navigation" class="vectorMenu emptyPortlet" aria-labelledby="p-variants-label"> <h3 id="p-variants-label"> <span>Variants</span><a href="#"></a> </h3> <div class="menu"> <ul> </ul> </div> </div> </div> <div id="right-navigation"> <div id="p-views" role="navigation" class="vectorTabs" aria-labelledby="p-views-label"> <h3 id="p-views-label">Views</h3> <ul> <li id="ca-view" class="selected"><span><a href="/wiki/Bay_of_Concepci%C3%B3n" >Read</a></span></li> <li id="ca-edit"><span><a href="/w/index.php?title=Bay_of_Concepci%C3%B3n&action=edit" title="Edit this page [e]" accesskey="e">Edit</a></span></li> <li id="ca-history" class="collapsible"><span><a href="/w/index.php?title=Bay_of_Concepci%C3%B3n&action=history" title="Past revisions of this page [h]" accesskey="h">View history</a></span></li> </ul> </div> <div id="p-cactions" role="navigation" class="vectorMenu emptyPortlet" aria-labelledby="p-cactions-label"> <h3 id="p-cactions-label"><span>More</span><a href="#"></a></h3> <div class="menu"> <ul> </ul> </div> </div> <div id="p-search" role="search"> <h3> <label for="searchInput">Search</label> </h3> <form action="/w/index.php" id="searchform"> <div id="simpleSearch"> <input type="search" name="search" placeholder="Search Wikipedia" title="Search 
Wikipedia [f]" accesskey="f" id="searchInput"/><input type="hidden" value="Special:Search" name="title"/><input type="submit" name="fulltext" value="Search" title="Search Wikipedia for this text" id="mw-searchButton" class="searchButton mw-fallbackSearchButton"/><input type="submit" name="go" value="Go" title="Go to a page with this exact name if it exists" id="searchButton" class="searchButton"/> </div> </form> </div> </div> </div> <div id="mw-panel"> <div id="p-logo" role="banner"><a class="mw-wiki-logo" href="/wiki/Main_Page" title="Visit the main page"></a></div> <div class="portal" role="navigation" id='p-navigation' aria-labelledby='p-navigation-label'> <h3 id='p-navigation-label'>Navigation</h3> <div class="body"> <ul> <li id="n-mainpage-description"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z">Main page</a></li><li id="n-contents"><a href="/wiki/Portal:Contents" title="Guides to browsing Wikipedia">Contents</a></li><li id="n-featuredcontent"><a href="/wiki/Portal:Featured_content" title="Featured content – the best of Wikipedia">Featured content</a></li><li id="n-currentevents"><a href="/wiki/Portal:Current_events" title="Find background information on current events">Current events</a></li><li id="n-randompage"><a href="/wiki/Special:Random" title="Load a random article [x]" accesskey="x">Random article</a></li><li id="n-sitesupport"><a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en" title="Support us">Donate to Wikipedia</a></li><li id="n-shoplink"><a href="//shop.wikimedia.org" title="Visit the Wikipedia store">Wikipedia store</a></li> </ul> </div> </div> <div class="portal" role="navigation" id='p-interaction' aria-labelledby='p-interaction-label'> <h3 id='p-interaction-label'>Interaction</h3> <div class="body"> <ul> <li id="n-help"><a href="/wiki/Help:Contents" title="Guidance on how to use and edit 
Wikipedia">Help</a></li><li id="n-aboutsite"><a href="/wiki/Wikipedia:About" title="Find out about Wikipedia">About Wikipedia</a></li><li id="n-portal"><a href="/wiki/Wikipedia:Community_portal" title="About the project, what you can do, where to find things">Community portal</a></li><li id="n-recentchanges"><a href="/wiki/Special:RecentChanges" title="A list of recent changes in the wiki [r]" accesskey="r">Recent changes</a></li><li id="n-contactpage"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia">Contact page</a></li> </ul> </div> </div> <div class="portal" role="navigation" id='p-tb' aria-labelledby='p-tb-label'> <h3 id='p-tb-label'>Tools</h3> <div class="body"> <ul> <li id="t-whatlinkshere"><a href="/wiki/Special:WhatLinksHere/Bay_of_Concepci%C3%B3n" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j">What links here</a></li><li id="t-recentchangeslinked"><a href="/wiki/Special:RecentChangesLinked/Bay_of_Concepci%C3%B3n" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li><li id="t-upload"><a href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u">Upload file</a></li><li id="t-specialpages"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q">Special pages</a></li><li id="t-permalink"><a href="/w/index.php?title=Bay_of_Concepci%C3%B3n&oldid=647460156" title="Permanent link to this revision of the page">Permanent link</a></li><li id="t-info"><a href="/w/index.php?title=Bay_of_Concepci%C3%B3n&action=info" title="More information about this page">Page information</a></li><li id="t-wikibase"><a href="https://www.wikidata.org/wiki/Q4874197" title="Link to connected data repository item [g]" accesskey="g">Wikidata item</a></li><li id="t-cite"><a href="/w/index.php?title=Special:CiteThisPage&page=Bay_of_Concepci%C3%B3n&id=647460156" title="Information on how to cite 
this page">Cite this page</a></li> </ul> </div> </div> <div class="portal" role="navigation" id='p-coll-print_export' aria-labelledby='p-coll-print_export-label'> <h3 id='p-coll-print_export-label'>Print/export</h3> <div class="body"> <ul> <li id="coll-create_a_book"><a href="/w/index.php?title=Special:Book&bookcmd=book_creator&referer=Bay+of+Concepci%C3%B3n">Create a book</a></li><li id="coll-download-as-rdf2latex"><a href="/w/index.php?title=Special:Book&bookcmd=render_article&arttitle=Bay+of+Concepci%C3%B3n&returnto=Bay+of+Concepci%C3%B3n&oldid=647460156&writer=rdf2latex">Download as PDF</a></li><li id="t-print"><a href="/w/index.php?title=Bay_of_Concepci%C3%B3n&printable=yes" title="Printable version of this page [p]" accesskey="p">Printable version</a></li> </ul> </div> </div> <div class="portal" role="navigation" id='p-lang' aria-labelledby='p-lang-label'> <h3 id='p-lang-label'>Languages</h3> <div class="body"> <ul> <li class="interlanguage-link interwiki-es"><a href="https://es.wikipedia.org/wiki/Bah%C3%ADa_de_Concepci%C3%B3n" title="Bahía de Concepción – Spanish" lang="es" hreflang="es" class="interlanguage-link-target">Español</a></li><li class="interlanguage-link interwiki-fr"><a href="https://fr.wikipedia.org/wiki/Baie_de_Concepci%C3%B3n" title="Baie de Concepción – French" lang="fr" hreflang="fr" class="interlanguage-link-target">Français</a></li><li class="interlanguage-link interwiki-ru"><a href="https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BD%D1%81%D0%B5%D0%BF%D1%81%D1%8C%D0%BE%D0%BD_(%D0%B7%D0%B0%D0%BB%D0%B8%D0%B2)" title="Консепсьон (залив) – Russian" lang="ru" hreflang="ru" class="interlanguage-link-target">Русский</a></li><li class="interlanguage-link interwiki-uk"><a href="https://uk.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BD%D1%81%D0%B5%D0%BF%D1%81%D1%8C%D0%B9%D0%BE%D0%BD_(%D0%B7%D0%B0%D1%82%D0%BE%D0%BA%D0%B0)" title="Консепсьйон (затока) – Ukrainian" lang="uk" hreflang="uk" class="interlanguage-link-target">Українська</a></li> </ul> <div 
class='after-portlet after-portlet-lang'><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Q4874197#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </div> <div id="footer" role="contentinfo"> <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last modified on 16 February 2015, at 22:18.</li> <li id="footer-info-copyright">Text is available under the <a rel="license" href="//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License">Creative Commons Attribution-ShareAlike License</a><a rel="license" href="//creativecommons.org/licenses/by-sa/3.0/" style="display:none;"></a>; additional terms may apply. By using this site, you agree to the <a href="//wikimediafoundation.org/wiki/Terms_of_Use">Terms of Use</a> and <a href="//wikimediafoundation.org/wiki/Privacy_policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a href="//www.wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://wikimediafoundation.org/wiki/Privacy_policy" class="extiw" title="wmf:Privacy policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Wikipedia:About" title="Wikipedia:About">About Wikipedia</a></li> <li id="footer-places-disclaimer"><a href="/wiki/Wikipedia:General_disclaimer" title="Wikipedia:General disclaimer">Disclaimers</a></li> <li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li> <li id="footer-places-developers"><a href="https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute">Developers</a></li> <li id="footer-places-cookiestatement"><a href="https://wikimediafoundation.org/wiki/Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a 
href="//en.m.wikipedia.org/w/index.php?title=Bay_of_Concepci%C3%B3n&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"> <a href="https://wikimediafoundation.org/"><img src="/static/images/wikimedia-button.png" srcset="/static/images/wikimedia-button-1.5x.png 1.5x, /static/images/wikimedia-button-2x.png 2x" width="88" height="31" alt="Wikimedia Foundation"/></a> </li> <li id="footer-poweredbyico"> <a href="//www.mediawiki.org/"><img src="/static/images/poweredby_mediawiki_88x31.png" alt="Powered by MediaWiki" srcset="/static/images/poweredby_mediawiki_132x47.png 1.5x, /static/images/poweredby_mediawiki_176x62.png 2x" width="88" height="31"/></a> </li> </ul> <div style="clear:both"></div> </div> <script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgPageParseReport":{"limitreport":{"cputime":"0.040","walltime":"0.057","ppvisitednodes":{"value":98,"limit":1000000},"ppgeneratednodes":{"value":0,"limit":1500000},"postexpandincludesize":{"value":4419,"limit":2097152},"templateargumentsize":{"value":0,"limit":2097152},"expansiondepth":{"value":3,"limit":40},"expensivefunctioncount":{"value":0,"limit":500},"entityaccesscount":{"value":1,"limit":400},"timingprofile":["100.00% 38.984 1 -total"," 73.51% 28.658 1 Template:Coord"," 26.14% 10.189 1 Template:Biobío-geo-stub"," 21.36% 8.328 1 Template:Asbox"]},"scribunto":{"limitreport-timeusage":{"value":"0.016","limit":"10.000"},"limitreport-memusage":{"value":832932,"limit":52428800}},"cachereport":{"origin":"mw1251","timestamp":"20170208034214","ttl":2592000,"transientcontent":false}}});});</script><script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":51,"wgHostname":"mw1271"});});</script> </body> </html>
The `map_reduce()` function: we have already built this function to divide the data into chunks and analyse them chunk by chunk in parallel, for faster processing of the data.
# The function :
import math
import functools
from multiprocessing import Pool
def make_chunks(data, num_chunks):
    """Split ``data`` into at most ``num_chunks`` contiguous slices.

    Every slice holds ``ceil(len(data) / num_chunks)`` items, except the last
    one, which holds whatever remains.

    Args:
        data: A sliceable sequence that supports ``len()``.
        num_chunks: Number of chunks to aim for.

    Returns:
        A list of slices of ``data`` in their original order. An empty
        ``data`` yields an empty list.
    """
    total = len(data)
    if total == 0:
        # A zero chunk size would make range() raise "arg 3 must not be zero".
        return []
    chunk_size = math.ceil(total / num_chunks)
    return [data[i:i + chunk_size] for i in range(0, total, chunk_size)]
def map_reduce(data, num_processes, mapper, reducer):
    """Apply ``mapper`` to chunks of ``data`` in parallel and fold the results.

    Args:
        data: Sequence to process; it is split with ``make_chunks``.
        num_processes: Number of worker processes, also used as the number
            of chunks requested from ``make_chunks``.
        mapper: Callable taking one chunk and returning a partial result.
            It is handed to ``Pool.map``, so it has to be picklable.
        reducer: Callable taking two partial results and returning their
            combination.

    Returns:
        The partial results folded left to right with ``reducer``.
    """
    chunks = make_chunks(data, num_processes)
    # pool.map() blocks until every chunk is processed, so the pool can be
    # torn down as soon as the with-block exits. Without this the worker
    # processes were never released.
    with Pool(num_processes) as pool:
        chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)
# First let's create a function to get the whole file at once :
def get_file(file_name, folder_name="wiki"):
    """Return the lines of ``folder_name/file_name`` as a list of strings.

    Args:
        file_name: Name of the file inside ``folder_name``.
        folder_name: Directory that contains the file. Defaults to
            ``"wiki"``, the folder that used to be hard-coded here.

    Returns:
        The file's lines in order, each keeping its trailing newline when
        it has one.
    """
    # Decode explicitly as UTF-8 instead of the platform's default encoding.
    # NOTE(review): assumes every page in the folder is UTF-8 - confirm.
    with open(os.path.join(folder_name, file_name), encoding="utf-8") as f:
        return f.readlines()
We need to create mapper and reducer functions to feed into the *map_reduce( )* function:
# mapper function to get the number of lines in each chunk :
def mapper(chunks):
    """Return the combined line count of every file named in ``chunks``.

    ``chunks`` is iterated as a collection of file names; each file is read
    through ``get_file``.
    """
    return sum(len(get_file(name)) for name in chunks)
# reducer function to combine the length of each chunk :
def reducer(length_chunk_1, length_chunk_2):
    """Add the line counts of two chunks and return the total."""
    return length_chunk_1 + length_chunk_2
As there are around 1000 files in `file_names`, let's take the number of processes as 5:
# finally calculating the total number of lines using map_reduce() :
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
total_lines = map_reduce(file_names, 5, mapper, reducer)
end = time.perf_counter()
time_taken_parallel = end - start
print(f"\033[1mThe total number of lines in all files : \033[0m{total_lines}")
print("\033[1mThe total time(seconds) taken for the query to process in parallel : "
      f"\033[0m{time_taken_parallel}")
The total number of lines in all files : 499797 The total time(seconds) taken for the query to process in parallel : 0.15456461906433105
## Let's check how our function performed as compared to query in series:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
lines = 0
for file in file_names:
    lines += len(get_file(file))
end = time.perf_counter()
time_taken_series = end - start
print(f"\033[1mThe total number of lines in all files : \033[0m{lines}")
print("\033[1mThe total time(seconds) taken for the query to process in series : "
      f"\033[0m{time_taken_series}")
The total number of lines in all files : 499797 The total time(seconds) taken for the query to process in series : 0.22296643257141113
## Let's see how fast our query was while using map reduce :
# Ratio of the serial duration to the parallel duration measured earlier.
print(
    "\033[1mSpeed advantage while using Map_reduce (x times) : "
    f"\033[0m{time_taken_series / time_taken_parallel}"
)
Speed advantage while using Map_reduce (x times) : 1.4425450916332325
Finding the occurrences of `data` in the wiki folder
# mapper function to create and store the values in a dictionary:
def mapper(chunks, target='data'):
    """Map each file name in ``chunks`` to the line numbers containing ``target``.

    Args:
        chunks: Iterable of file names; each file is read through
            ``get_file``.
        target: Substring to search for. Defaults to ``'data'``, the value
            that used to be hard-coded here.

    Returns:
        A dict keyed by file name. Each value is the list of 1-based line
        numbers whose text contains ``target`` (case-sensitive substring
        match); a file without any match maps to an empty list.
    """
    occurrences = {}
    for file_name in chunks:
        occurrences[file_name] = [
            line_number
            for line_number, line in enumerate(get_file(file_name), start=1)
            if target in line
        ]
    return occurrences
# reducer function: merges the dictionaries produced for two chunks
def reducer(dict_1, dict_2):
    """Return a new dictionary holding the entries of both inputs.

    Neither argument is modified. If a key is present in both, the value
    from dict_2 is kept.
    """
    return {**dict_1, **dict_2}
# Total occurrences of 'data' using map_reduce(), with the call timed.
# perf_counter() is the clock meant for measuring short durations; time.time()
# follows the wall clock, which can be coarse or adjusted while we measure.
start = time.perf_counter()
total_occurrences = map_reduce(file_names, 5, mapper, reducer)
end = time.perf_counter()
time_taken_parallel = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in parallel : '
      + "\033[0m" + str(time_taken_parallel))
The total time(seconds) taken for the query to process in parallel : 0.14320945739746094
# Counting the total occurrences of 'data' without map_reduce (serial baseline).
# perf_counter() is used because it is designed for measuring short durations.
start = time.perf_counter()
occurrences = {}
for file_name in file_names:
    # 1-based numbers of the lines that contain 'data' (case-sensitive)
    occurrences[file_name] = [
        line_count
        for line_count, line in enumerate(get_file(file_name), start=1)
        if 'data' in line
    ]
end = time.perf_counter()
time_taken_series = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in series : '
      + "\033[0m" + str(time_taken_series))

# Let's see how fast our query was while using map reduce :
print("\033[1m" + 'Speed advantage while using Map_reduce (x times) : '
      + "\033[0m" + str(time_taken_series / time_taken_parallel))
The total time(seconds) taken for the query to process in series : 0.392749547958374 Speed advantage while using Map_reduce (x times) : 2.7424833184609034
Now let's compare the dictionaries :
# Print the entries of the first ten files from both dictionaries so they can
# be compared by eye. A slice is used so that a folder holding fewer than ten
# files does not raise IndexError.
keys = file_names[:10]

# by map_reduce :
print("\033[1m" + 'The dictionary with map_reduce : '
      + "\033[0m")
for key in keys:
    print(str(key) + ' : ' + str(total_occurrences[key]))

# without map_reduce:
print('\n')
print("\033[1m" + 'The dictionary without map_reduce : '
      + "\033[0m")
for key in keys:
    print(str(key) + ' : ' + str(occurrences[key]))
The dictionary with map_reduce : Bay_of_ConcepciC3B3n.html : [7, 46, 59, 61, 63, 106, 189, 206] Bye_My_Boy.html : [277, 360, 377] Valentin_Yanin.html : [102, 145, 228, 245] Kings_XI_Punjab_in_2014.html : [222, 230, 238, 246, 254, 270, 278, 294, 302, 318, 326, 342, 375, 377, 382, 384, 389, 391, 396, 398, 403, 565, 648, 665] William_Harvey_Lillard.html : [46, 66, 82, 130, 213, 230] Radial_Road_3.html : [53, 104, 302, 506, 589, 606] George_Weldrick.html : [195, 278, 295] Zgornji_Otok.html : [7, 54, 56, 66, 70, 212, 261, 263, 312, 395, 412] Blue_Heelers_(season_8).html : [50, 80, 83, 106, 108, 126, 128, 134, 136, 142, 144, 661, 696, 731, 740, 887, 970, 987] Taggen_Nunatak.html : [7, 45, 47, 49, 94, 177, 194] The dictionary without map_reduce : Bay_of_ConcepciC3B3n.html : [7, 46, 59, 61, 63, 106, 189, 206] Bye_My_Boy.html : [277, 360, 377] Valentin_Yanin.html : [102, 145, 228, 245] Kings_XI_Punjab_in_2014.html : [222, 230, 238, 246, 254, 270, 278, 294, 302, 318, 326, 342, 375, 377, 382, 384, 389, 391, 396, 398, 403, 565, 648, 665] William_Harvey_Lillard.html : [46, 66, 82, 130, 213, 230] Radial_Road_3.html : [53, 104, 302, 506, 589, 606] George_Weldrick.html : [195, 278, 295] Zgornji_Otok.html : [7, 54, 56, 66, 70, 212, 261, 263, 312, 395, 412] Blue_Heelers_(season_8).html : [50, 80, 83, 106, 108, 126, 128, 134, 136, 142, 144, 661, 696, 731, 740, 887, 970, 987] Taggen_Nunatak.html : [7, 45, 47, 49, 94, 177, 194]
We see that both dictionaries are essentially the same, with map_reduce( ) giving us a major processing-time advantage.
In order to make the function case insensitive, we just need a small modification that converts each line to lowercase before searching :
# mapper function to create and store the values in a dictionary:
def mapper(chunks):
    """Map each file of a chunk to the line numbers that contain 'data'.

    Every line is lowercased before the substring test, which makes the
    match case-insensitive.

    Args:
        chunks: iterable of file names, each passed to get_file().

    Returns:
        dict: file name -> list of 1-based line numbers whose lowercased
        text contains 'data'. A file without any match maps to an empty list.
    """
    occurrences = {}
    for file_name in chunks:
        lines = get_file(file_name)
        # enumerate(start=1) replaces the hand-maintained line counter
        occurrences[file_name] = [
            line_count
            for line_count, line in enumerate(lines, start=1)
            if 'data' in line.lower()
        ]
    return occurrences
# reducer function: merges the dictionaries produced for two chunks
def reducer(dict_1, dict_2):
    """Combine two dictionaries into a new one without modifying either.

    When both inputs share a key, the value from dict_2 wins.
    """
    combined = dict(dict_1)
    combined.update(dict_2)
    return combined
# Total occurrences of 'data' (case-insensitive) using map_reduce(), timed.
# perf_counter() is the clock meant for measuring short durations; time.time()
# follows the wall clock, which can be coarse or adjusted while we measure.
start = time.perf_counter()
total_occurrences_lower = map_reduce(file_names, 5, mapper, reducer)
end = time.perf_counter()
time_taken_parallel = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in parallel : '
      + "\033[0m" + str(time_taken_parallel))
The total time(seconds) taken for the query to process in parallel : 0.2129971981048584
# Counting the total occurrences of 'data' without map_reduce (serial baseline,
# case-insensitive).
# perf_counter() is used because it is designed for measuring short durations.
start = time.perf_counter()
occurrences = {}
for file_name in file_names:
    # 1-based numbers of the lines whose lowercased text contains 'data'
    occurrences[file_name] = [
        line_count
        for line_count, line in enumerate(get_file(file_name), start=1)
        if 'data' in line.lower()
    ]
end = time.perf_counter()
time_taken_series = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in series : '
      + "\033[0m" + str(time_taken_series))

## Let's see how fast our query was while using map reduce :
print("\033[1m" + 'Speed advantage while using Map_reduce (x times) : '
      + "\033[0m" + str(time_taken_series / time_taken_parallel))
The total time(seconds) taken for the query to process in series : 0.5423007011413574 Speed advantage while using Map_reduce (x times) : 2.5460461732195325
Let's see if there were more matches in the modified function :
# For every file present in both result dictionaries, print how many more
# matching lines the case-insensitive search found than the case-sensitive one.
# A direct dictionary lookup replaces the nested scan over all keys, which
# compared every pair of file names just to find the equal ones.
for key_1, sensitive_hits in total_occurrences.items():
    if key_1 not in total_occurrences_lower:
        continue
    difference = len(total_occurrences_lower[key_1]) - len(sensitive_hits)
    if difference > 0:
        print(str(key_1) + ' : ' + str(difference))
Table_Point_Formation.html : 1 Ingrid_GuimarC3A3es.html : 1 Jules_Verne_ATV.html : 2 Pictogram.html : 1 Claire_Danes.html : 2 PTPRS.html : 1 A_Beautiful_Valley.html : 1 Mudramothiram.html : 1 Gordon_Bau.html : 2 Embraer_Unidade_GaviC3A3o_Peixoto_Airport.html : 1 Code_page_1023.html : 3 Cryptographic_primitive.html : 1 Alex_Kurtzman.html : 1 Filip_Pyrochta.html : 1 Morgana_King.html : 1 Don_Parsons_(ice_hockey).html : 1 Bias.html : 1 Tomohiko_ItC58D_(director).html : 2 Imperial_Venus_(film).html : 1 Camp_Nelson_Confederate_Cemetery.html : 1 Benny_Lee.html : 1 Kul_Gul.html : 1 Medicago_murex.html : 1 Oldfield_Baby_Great_Lakes.html : 1 Wilson_Global_Explorer.html : 1 Craig_Chester.html : 1 Derek_Acorah.html : 1 Jack_Goes_Home.html : 1 Morning_Glory_(2010_film).html : 1 Tim_Spencer_(singer).html : 1 Lower_Blackburn_Grade_Bridge.html : 1 1953E2809354_FA_Cup_qualifying_rounds.html : 1 Sol_Eclipse.html : 1 Jonathan_A._Goldstein.html : 1 83_(number).html : 1 Devil_on_Horseback.html : 1 Harry_Hill_Bandholtz.html : 1 Shpolskii_matrix.html : 2 Dragnet_(franchise).html : 6 Qalat_Kat.html : 1 Maniitsoq_structure.html : 3 Ordinary_Virginia.html : 1 Dewoitine_D.21.html : 1 Furto_di_sera_bel_colpo_si_spera.html : 1 Rudy_The_Rudy_Giuliani_Story.html : 1 Exploratorium_(film).html : 1 Foulonia.html : 1 Amborella.html : 1 Rally_for_Democracy_and_Progress_(Benin).html : 1 Swathi_Chinukulu.html : 1 Precorrin6A_reductase.html : 2 The_Gentleman_Without_a_Residence_(1915_film).html : 1 Manhattan_Murder_Mystery.html : 1 Viva_Villa.html : 2 Companys_procC3A9s_a_Catalunya.html : 1 Avengers_Academy.html : 1 Antibiotic_use_in_livestock.html : 1 Syngenor.html : 1 Cobble_Hill_Brooklyn.html : 1 Typhoon_Hester_(1952).html : 1 WintersWimberley_House.html : 1 Kokan_Colony.html : 1 Wilhelm_Wagenfeld_House.html : 1 Taipa_HousesE28093Museum.html : 2 WLSR.html : 1 Lake_County_Examiner.html : 1 Copamyntis_infusella.html : 1 C11orf30.html : 1 Old_Mill_Creek_Illinois.html : 1 Bahmanabade_Olya.html : 1 
Ek_Dil_Sau_Afsane.html : 1 Daniel_Cerone.html : 1 Shoreyjehye_Do.html : 1 Failing_Office_Building.html : 1 Pushkar.html : 1 List_of_Uzbek_films_of_2014.html : 1 KMTZ.html : 1 Golabkhvaran.html : 1 CurtissWright_Hangar_(Columbia_South_Carolina).html : 1 Blue_SWAT.html : 1 Danish_Maritime_Safety_Administration.html : 1 Don_Raye.html : 1 Lis_LC3B8wert.html : 1 Doumanaba.html : 1 Sahanpur.html : 1 Meleh_Kabude_Sofla.html : 1 Panchamrutham.html : 1 Bibiana_Beglau.html : 1 Kattukukke.html : 1 Acceptance_(Heroes).html : 1 Westchester_Los_Angeles.html : 1 Appa_(film).html : 1 HD_90156.html : 1 The_Audacity_to_Podcast.html : 2 Brownfield_(software_development).html : 1 Boardman_Township_Mahoning_County_Ohio.html : 1 King_Parker_House.html : 1 List_of_Spaghetti_Western_films.html : 2 The_Future_(film).html : 1 Weiser_River.html : 1 Jon_Mullich.html : 1 Saravan_Gilan.html : 1 Agaritine_gammaglutamyltransferase.html : 2 Nuno_Leal_Maia.html : 1 Battle_of_Wattignies.html : 1 Colchester_Village_Historic_District.html : 1 Hayateumi_Hidehito.html : 1 List_of_people_from_Bangor_Maine.html : 7 Mirisah.html : 1 Teiji_Ito.html : 1 L._Fry.html : 1 Tropical_sprue.html : 1 Roxbury_Presbyterian_Church.html : 1 Peter_Collingwood.html : 1 List_of_molecular_graphics_systems.html : 4 Functoid.html : 1 Vojin_C486etkoviC487.html : 1 Julien_Boisselier.html : 1 Jazz_in_Turkey.html : 1 Kim_Yonghwa.html : 2 Holly_Golightly_(comics).html : 1 SalemAuburn_Streets_Historic_District.html : 1 Kate_Harwood.html : 2 Gulliver_Mickey.html : 1 Urs_Burkart.html : 1 Smilax_laurifolia.html : 1 Taylor_Williamson.html : 1 Claudia_Neidig.html : 1 Dean_Kukan.html : 1 Demographics_of_American_Samoa.html : 1 C389cole_des_Mines_de_Douai.html : 1 Frost_Township_Michigan.html : 1 Shabbir_Kumar.html : 1 West_Park_Bridge.html : 1
def mapper(chunks):
    """Locate every word containing 'data' in each file of a chunk.

    Lines are lowercased first, so the match is case-insensitive. A line is
    split on single spaces, so a "word" is a space-delimited token and
    consecutive spaces produce empty tokens that still count as positions.

    Args:
        chunks: iterable of file names, each passed to get_file().

    Returns:
        dict: file name -> list of (line_number, word_number) tuples, both
        1-based, in order of appearance. A file without any match maps to
        an empty list.
    """
    occurrences = {}
    for file_name in chunks:
        in_line = []
        for line_count, line in enumerate(get_file(file_name), start=1):
            line = line.lower()
            # cheap substring test first, so only matching lines get split
            if 'data' not in line:
                continue
            for word_count, word in enumerate(line.split(' '), start=1):
                if 'data' in word:
                    in_line.append((line_count, word_count))
        occurrences[file_name] = in_line
    return occurrences
# reducer function: merges the dictionaries produced for two chunks
def reducer(dict_1, dict_2):
    """Return a fresh dictionary with the entries of dict_1 then dict_2.

    The inputs are left untouched; on a shared key the dict_2 value is kept.
    """
    return dict(dict_1, **{}) if not dict_2 else {**dict_1, **dict_2}
# Total occurrences of 'data' (line and word positions) using map_reduce(), timed.
# perf_counter() is the clock meant for measuring short durations; time.time()
# follows the wall clock, which can be coarse or adjusted while we measure.
start = time.perf_counter()
total_occurrences = map_reduce(file_names, 5, mapper, reducer)
end = time.perf_counter()
time_taken = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in parallel : '
      + "\033[0m" + str(time_taken))
The total time(seconds) taken for the query to process in parallel : 0.24758434295654297
# Inspecting the dictionary: print the entries of the first ten files,
# each followed by a blank line.
print("\033[1m" + 'The dictionary with map_reduce : '
      + "\033[0m")
# a slice tolerates folders holding fewer than ten files
keys = file_names[:10]
for key in keys:
    print(str(key) + ' : ' + str(total_occurrences[key]) + '\n')
The dictionary with map_reduce :
Bay_of_ConcepciC3B3n.html : [(7, 5), (46, 15), (46, 16), (59, 22), (59, 39), (61, 2), (63, 13), (63, 14), (106, 4), (106, 41), (106, 44), (106, 46), (189, 71), (189, 75), (189, 79), (206, 6)]
Bye_My_Boy.html : [(277, 4), (360, 71), (360, 75), (360, 79), (377, 6)]
Valentin_Yanin.html : [(102, 18), (145, 4), (228, 71), (228, 75), (228, 79), (245, 6)]
Kings_XI_Punjab_in_2014.html : [(222, 19), (222, 20), (230, 19), (230, 20), (238, 19), (238, 20), (246, 19), (246, 20), (254, 19), (254, 20), (270, 19), (270, 20), (278, 19), (278, 20), (294, 19), (294, 20), (302, 19), (302, 20), (318, 19), (318, 20), (326, 19), (326, 20), (342, 19), (342, 20), (375, 14), (375, 15), (377, 22), (377, 23), (382, 14), (382, 15), (384, 22), (384, 23), (389, 16), (389, 17), (391, 20), (391, 21), (396, 16), (396, 17), (398, 20), (398, 21), (403, 14), (403, 15), (565, 4), (648, 71), (648, 75), (648, 79), (665, 6)]
William_Harvey_Lillard.html : [(46, 11), (46, 12), (66, 10), (82, 10), (130, 4), (213, 71), (213, 75), (213, 79), (230, 6)]
Radial_Road_3.html : [(53, 14), (53, 15), (104, 15), (104, 16), (302, 13), (302, 14), (506, 4), (589, 71), (589, 75), (589, 79), (606, 6)]
George_Weldrick.html : [(195, 4), (278, 71), (278, 75), (278, 79), (295, 6)]
Zgornji_Otok.html : [(7, 11), (54, 24), (54, 25), (56, 14), (56, 15), (66, 18), (66, 35), (66, 65), (66, 82), (70, 14), (70, 15), (212, 25), (212, 26), (261, 2), (263, 13), (263, 14), (312, 4), (312, 57), (312, 60), (312, 62), (395, 71), (395, 75), (395, 79), (412, 6)]
Blue_Heelers_(season_8).html : [(50, 12), (50, 13), (80, 3), (83, 15), (83, 16), (106, 3), (108, 13), (108, 14), (126, 3), (128, 13), (128, 14), (134, 3), (136, 13), (136, 14), (142, 3), (144, 13), (144, 14), (661, 13), (661, 14), (696, 13), (696, 14), (731, 13), (731, 14), (740, 14), (740, 15), (887, 4), (970, 71), (970, 75), (970, 79), (987, 6)]
Taggen_Nunatak.html : [(7, 14), (45, 10), (45, 11), (45, 77), (45, 94), (47, 2), (49, 13), (49, 14), (94, 4), (94, 63), (94, 66), (94, 68), (177, 71), (177, 75), (177, 79), (194, 6)]
# Let's first turn the created dictionary into a pandas dataframe, one file
# at a time (serial baseline for the map_reduce() version).
# perf_counter() is used because it is designed for measuring short durations.
main_df = pd.DataFrame()
start = time.perf_counter()
for key in total_occurrences:
    # one row per (line, word) match of this file
    df = pd.DataFrame(total_occurrences[key],
                      columns=('Lines', 'Index'))
    df['File'] = key
    df = df[['File', 'Lines', 'Index']]  # put the file name first
    # NOTE: concatenating inside the loop copies the accumulated frame on
    # every iteration; kept unchanged because this loop is what is being timed.
    main_df = pd.concat([main_df, df], axis=0, ignore_index=True)
end = time.perf_counter()
time_taken_series = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in series: '
      + "\033[0m" + str(time_taken_series) + '\n')
print("\033[1m" + 'Few rows of the processed dataframe : '
      + "\033[0m")
print(main_df.head())
print('\n')
print("\033[1m" + 'Final shape of the dataframe : '
      + "\033[0m" + str(main_df.shape))
The total time(seconds) taken for the query to process in series: 2.487610101699829 Few rows of the processed dataframe : File Lines Index 0 Bay_of_ConcepciC3B3n.html 7 5 1 Bay_of_ConcepciC3B3n.html 46 15 2 Bay_of_ConcepciC3B3n.html 46 16 3 Bay_of_ConcepciC3B3n.html 59 22 4 Bay_of_ConcepciC3B3n.html 59 39 Final shape of the dataframe : (20517, 3)
Let's now create mapper and reducer functions that build the same dataframe more efficiently using *map_reduce( )* :
# mapper function: builds one dataframe from the matches of a chunk of files
def mapper(chunks):
    """Turn the matches of a chunk of files into a single DataFrame.

    Reads the module-level ``total_occurrences`` dictionary, looking up each
    key of the chunk in it.

    Args:
        chunks: iterable of keys of ``total_occurrences``.

    Returns:
        pandas.DataFrame: columns 'File', 'Lines', 'Index' with one row per
        entry of ``total_occurrences[key]``, in chunk order and re-indexed
        from 0. An empty chunk yields an empty DataFrame.

    Raises:
        KeyError: if a key of the chunk is missing from ``total_occurrences``.
    """
    frames = []
    for key in chunks:
        df = pd.DataFrame(total_occurrences[key],
                          columns=('Lines', 'Index'))
        df['File'] = key
        frames.append(df[['File', 'Lines', 'Index']])  # file name first
    if not frames:
        # pd.concat() rejects an empty list
        return pd.DataFrame()
    # concatenate once: doing it inside the loop would copy the accumulated
    # frame on every iteration
    return pd.concat(frames, axis=0, ignore_index=True)
# reducer function: stacks the dataframes produced for two chunks
def reducer(df_1, df_2):
    """Return the rows of df_1 followed by the rows of df_2.

    The result gets a fresh 0..n-1 index; the inputs are not modified.
    """
    frames = [df_1, df_2]
    return pd.concat(frames, axis=0, ignore_index=True)
# Using map_reduce() to convert the dictionary into a pandas dataframe, timed.
# perf_counter() is the clock meant for measuring short durations; time.time()
# follows the wall clock, which can be coarse or adjusted while we measure.
start = time.perf_counter()
df_map = map_reduce(file_names, 5, mapper, reducer)
end = time.perf_counter()
time_taken_parallel = end - start

print("\033[1m" + 'The total time(seconds) taken for the query to process in parallel : '
      + "\033[0m" + str(time_taken_parallel) + '\n')
print("\033[1m" + 'Few rows of the processed dataframe : '
      + "\033[0m")
print(df_map.head())
print('\n')
print("\033[1m" + 'Final shape of the dataframe : '
      + "\033[0m" + str(df_map.shape))
print('\n')

## Let's see how fast our query was while using map reduce :
print("\033[1m" + 'Speed advantage while using Map_reduce (x times) : '
      + "\033[0m" + str(time_taken_series / time_taken_parallel))
The total time(seconds) taken for the query to process in parallel : 0.8889327049255371 Few rows of the processed dataframe : File Lines Index 0 Bay_of_ConcepciC3B3n.html 7 5 1 Bay_of_ConcepciC3B3n.html 46 15 2 Bay_of_ConcepciC3B3n.html 46 16 3 Bay_of_ConcepciC3B3n.html 59 22 4 Bay_of_ConcepciC3B3n.html 59 39 Final shape of the dataframe : (20517, 3) Speed advantage while using Map_reduce (x times) : 2.798423421611209