#!/usr/bin/env python
# coding: utf-8

# # urlExpander Quickstart
# View this notebook on [NBViewer](http://nbviewer.jupyter.org/github/SMAPPNYU/urlExpander/blob/master/examples/quickstart.ipynb?flush_cache=true) or [Github](https://github.com/SMAPPNYU/urlExpander/blob/master/examples/quickstart.ipynb) | Run it interactively on
# [Binder](https://mybinder.org/v2/gh/SMAPPNYU/urlExpander/master?filepath=examples%2Fquickstart.ipynb)
# By [Leon Yin](leonyin.org) for [SMaPP NYU](https://wp.nyu.edu/smapp/)
#
# [urlExpander](https://github.com/SMAPPNYU/urlExpander) is a Python package for quickly and thoroughly expanding URLs.
#
# You can download the software using pip:
#
#     pip install urlexpander

# In[1]:

import urlexpander
from runtimestamp.runtimestamp import runtimestamp

runtimestamp('QuickStart User')
print(f"This notebook is using urlExpander v{urlexpander.__version__}")

# Here is a toy example of some URLs taken from Congressional Twitter accounts:

# In[2]:

urls = [
    'https://trib.al/xXI5ruM',
    'http://bit.ly/1Sv81cj',
    'https://www.youtube.com/watch?v=8NwKcfXvGl4',
    'https://t.co/zNU1eHhQRn',
]

# We can use the `expand` function (see the code) to unshorten any link:

# In[3]:

urlexpander.expand(urls[0])

# It also works on any list of URLs.

# In[4]:

urlexpander.expand(urls)

# To save compute time, we can skip links that don't need to be expanded.
# The `is_short` function takes any URL and checks whether the domain is from a known list of link shorteners.

# In[5]:

print(f"{urls[1]} returns:")
urlexpander.is_short(urls[1])

# bit.ly is probably the best-known link shortener; youtube.com, however, is not a link shortener!

# In[6]:

print(f"{urls[2]} returns:")
urlexpander.is_short(urls[2])

# urlExpander takes advantage of a list of known domains that offer link-shortening services.

# In[7]:

known_shorteners = urlexpander.constants.all_short_domains.copy()
print(len(known_shorteners))

# You can make modifications or use your own `list_of_domains` as an argument for the `is_short` function or for `is_short_domain` (which is faster and operates on the domain level); a short sketch of `is_short_domain` appears after the filter examples below.

# In[8]:

known_shorteners += ['youtube.com']

# In[9]:

print(f"Now {urls[2]} returns:")
urlexpander.is_short(urls[2], list_of_domains=known_shorteners) # list_of_domains defaults to the built-in shortener list

# Now we can shorten our workload:

# In[10]:

# keep only the links that actually need to be expanded
urls_to_shorten = [link for link in urls if urlexpander.is_short(link)]
urls_to_shorten

# urlExpander's `expand` function does the heavy lifting to quickly and thoroughly expand a list of links:

# In[11]:

expanded_urls = urlexpander.expand(urls_to_shorten)
expanded_urls

# Note that URLs which resolve to defunct pages still return the domain name, followed by the type of error surrounded by two underscores, e.g. `http://www.billshusterforcongress.com/__CONNECTIONPOOL_ERROR__`.
#
# Instead of filtering the inputs before running the `expand` function, you can assign a filter using the `filter_function` argument.
# Filter functions can be any boolean function that operates on a string. Below is an example function that filters for t.co links:

# In[12]:

def custom_filter(url):
    '''Returns True if the URL is a shortened Twitter (t.co) link.'''
    return urlexpander.get_domain(url) == 't.co'

# In[13]:

resolved_links = urlexpander.expand(urls, 
                                    filter_function=custom_filter, 
                                    verbose=1)
resolved_links

# Although filtering within the `expand` function is convenient, you will see a difference in run time.

# In[15]:

resolved_links = urlexpander.expand(urls, 
                                    filter_function=urlexpander.is_short, 
                                    verbose=1)
resolved_links
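# As mentioned earlier, `is_short_domain` operates on bare domains rather than full URLs, which skips
# URL parsing and should be faster when you already have domains. The cell below is a minimal sketch
# rather than documented usage: it assumes `is_short_domain` takes a domain string and accepts the same
# `list_of_domains` argument as `is_short`; check `help(urlexpander.is_short_domain)` before relying on it.

for domain in ['bit.ly', 'youtube.com', 'nytimes.com']:
    # with our modified known_shorteners list, youtube.com should now also count as "short"
    print(domain, urlexpander.is_short_domain(domain, list_of_domains=known_shorteners))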
# But that was a toy example; let's see how this fares with a larger dataset.
# This package comes with a [sampled dataset](https://github.com/SMAPPNYU/urlExpander/blob/master/urlexpander/core/datasets.py#L8-L29) of links extracted from the Twitter accounts of the 115th Congress.
# If you work with Twitter data, you'll be glad to know there is a function, `urlexpander.tweet_utils.get_link`, for creating a similar dataset from Tweets (a hedged sketch appears a few cells below).

# In[16]:

df_congress = urlexpander.datasets.load_congress_twitter_links(nrows=10000)

print(f'The dataset has {len(df_congress)} rows')
df_congress.tail(2)

# In[22]:

shortened_urls = df_congress[df_congress.link_domain.apply(urlexpander.is_short)].tweet_id.nunique()
all_urls = df_congress.tweet_id.nunique()
shortened_urls / all_urls

# About 28% of the tweets in this sample contain a shortened link!
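# The congressional dataset above ships with the package, but `tweet_utils.get_link` is meant to build
# the same kind of table from your own Tweets. The sketch below is an illustration under assumptions,
# not the documented API: it assumes `get_link` accepts a tweet dictionary as returned by the Twitter
# API and yields one record per embedded link. Check `help(urlexpander.tweet_utils.get_link)` first.

import pandas as pd

tweets = []  # replace with tweet dictionaries collected from the Twitter API

link_records = []
for tweet in tweets:
    # assumed behavior: one record (tweet_id, user_id, link_url, link_domain, ...) per URL in the tweet
    link_records.extend(urlexpander.tweet_utils.get_link(tweet))

df_links = pd.DataFrame(link_records)
df_links.head()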
# The performance of the next script is dependent on your internet connection:

# In[17]:

get_ipython().system('curl -s https://raw.githubusercontent.com/sivel/speedtest-cli/master/speedtest.py | python -')

# Let's see how long it takes to expand these 10k links.
#
# This is where the optional parameters for `expand` shine.
# We can create multiple threads for requests (using `n_workers`), cache results into a JSON file (`cache_file`), and chunk the input into smaller pieces (using `chunksize`). Why does this last part matter? Something I noticed when expanding links en masse is that performance degrades over time. Chunking the input prevents this from happening (not sure why, though)!

# In[23]:

resolved_links = urlexpander.expand(df_congress['link_url_long'], 
                                    chunksize=1280,
                                    n_workers=64, 
                                    cache_file='temp.json', 
                                    verbose=1,
                                    filter_function=urlexpander.is_short)

# At SMaPP, the process of link expansion has been a burden on our research.
# We hope that this software helps you overcome similar obstacles!

# In[24]:

df_congress['expanded_url'] = resolved_links
df_congress['resolved_domain'] = df_congress['expanded_url'].apply(urlexpander.get_domain)
df_congress.tail(2)

# Here are the top 25 shared domains from this sampled Congress dataset:

# In[25]:

df_congress.resolved_domain.value_counts().head(25)
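# Before reading too much into these counts, recall from earlier that links which fail to resolve keep
# their domain and append an error code wrapped in double underscores (e.g. `__CONNECTIONPOOL_ERROR__`).
# Below is a minimal sketch for flagging those rows, assuming that naming convention; the
# `expansion_failed` column name is our own, not part of the package.

# mark rows whose expanded URL ends in the double-underscore error marker described above
df_congress['expansion_failed'] = df_congress['expanded_url'].str.contains(r'__[A-Z_]+__$', na=False)
df_congress['expansion_failed'].mean()  # share of links that failed to resolve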
# # Bonus Round!
# You can count the number of `resolved_domain`s for each `user_id` using `count_matrix()`.
# You can even choose which domains are counted by modifying the `domain_list` arg:

# In[26]:

count_matrix = urlexpander.tweet_utils.count_matrix(df_congress,
                                                    user_col='user_id', 
                                                    domain_col='resolved_domain', 
                                                    unique_count_col='tweet_id',
                                                    domain_list=['youtube.com', 'facebook.com', 'google.com', 'twitter.com'])

count_matrix.tail(3)

# One of the domain lists you might be interested in is US national media outlets:
# `datasets.load_us_national_media_outlets()`, compiled by Gregory Eady (forthcoming).

# In[26]:

urlexpander.datasets.load_us_national_media_outlets()[:5]
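# The two pieces above can be combined. Below is a hedged sketch that passes the media-outlet list to
# `count_matrix` as `domain_list`; it assumes `load_us_national_media_outlets()` returns a list-like
# of domain strings, which the slicing in the previous cell suggests but does not guarantee.

media_domains = list(urlexpander.datasets.load_us_national_media_outlets())

media_matrix = urlexpander.tweet_utils.count_matrix(df_congress,
                                                    user_col='user_id',
                                                    domain_col='resolved_domain',
                                                    unique_count_col='tweet_id',
                                                    domain_list=media_domains)
media_matrix.tail(3)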
# We also built a one-size-fits-all scraper that returns the title, description, and/or paragraphs from any given URL.

# In[27]:

urlexpander.html_utils.get_webpage_title(urls[0])

# In[28]:

urlexpander.html_utils.get_webpage_description(urls[0])

# In[29]:

urlexpander.html_utils.get_webpage_meta(urls[0])

# ## Conclusion
# Thanks for stumbling upon this package; we hope that it will lead to more research around links.
# We're working on some projects in this vein and would love to know if you are, too!
#
# This is an open-source package, so please feel free to reach out about bugs, feature requests, or collaboration!