#!/usr/bin/env python
# coding: utf-8
"""Scrape link data from Wikipedia's list of ISS expeditions.

Originally exported from a Jupyter notebook.  When run as a script (or
pasted into a notebook cell) it:

1. downloads the page at ``url`` once,
2. walks the first two HTML tables and collects link text -> target
   mappings into ``exps``, ``ppls``, ``ppcs``, ``msns`` and ``cnts``,
3. loads the same two tables into a single DataFrame ``df``,
4. splits every ``Crew`` cell into the names that are keys of ``ppls``.

Requires the third-party packages ``beautifulsoup4`` and ``requests``
(``pip install beautifulsoup4 requests``) in addition to pandas.
"""

import io

import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_International_Space_Station_expeditions'


def _collect_links(tables):
    """Collect link information from the rows of the given tables.

    ``tables`` is an iterable of BeautifulSoup ``<table>`` tags.  The first
    row of every table is skipped.  For each ``<a>`` inside a ``<td>`` the
    position ``j`` of that ``<td>`` within its row decides what is recorded:

    * ``j == 0``: link text -> href, stored in ``exps``.
    * ``j == 1``: a link without text is expected to wrap an ``<img>``; its
      ``alt`` text becomes the current ``cnt`` and, unless it contains
      ``'ISS'``, ``alt`` -> ``src`` is stored in ``cnts``.  A link with text
      (and no ``'['``) is stored as text -> href in ``ppls`` and
      text -> current ``cnt`` in ``ppcs``.
    * any other ``j``: non-empty text without ``'['`` -> href, stored in
      ``msns``.

    In every mapping the first occurrence of a key wins.

    NOTE(review): the names suggest expeditions / people / people-to-country
    / missions / countries, and that the page puts the expedition in the
    first cell and the crew in the second -- confirm against the live page.

    Returns the tuple ``(exps, ppls, ppcs, msns, cnts)``.
    """
    exps, ppls, ppcs, msns, cnts = {}, {}, {}, {}, {}
    # Alt text of the most recent image link seen in a j == 1 cell.  It
    # starts as None so a named link that precedes every image no longer
    # fails on an unbound variable.
    cnt = None
    for table in tables:
        for tr in table.find_all("tr")[1:]:  # row 0 is skipped, as before
            for j, td in enumerate(tr.find_all("td")):
                for a in td.find_all("a"):
                    txt = a.text
                    href = a.get('href')
                    if j == 0:
                        # The original looked up an undefined name ``exp``
                        # here; the link text is the key.
                        exps.setdefault(txt, href)
                    elif j == 1:
                        print(txt)
                        if not txt:
                            img = a.find('img')
                            if img is None or not img.get('alt'):
                                # Text-less link without a usable image:
                                # nothing to record.
                                continue
                            cnt = img['alt']
                            if 'ISS' not in cnt:
                                cnts.setdefault(cnt, img.get('src'))
                        elif '[' not in txt:  # '[' marks e.g. "[1]" links
                            ppcs.setdefault(txt, cnt)
                            ppls.setdefault(txt, href)
                    elif txt and '[' not in txt:
                        msns.setdefault(txt, href)
    return exps, ppls, ppcs, msns, cnts


def find_names(s, ppls, z):
    """Split the start of ``s`` into consecutive names known to ``ppls``.

    The first two space-separated words of ``s`` are tried as a name, then
    the first three.  On the first hit the name is printed, appended to
    ``z``, and the search continues on the remainder of the string.  The
    search stops as soon as the front of the remaining text is not a known
    name, so names located after an unknown word are not found.

    Args:
        s: text to split, words separated by single spaces.
        ppls: container of known names (only membership is tested).
        z: list that found names are appended to; it is mutated in place.

    Returns:
        The same list object ``z``.
    """
    if not s:
        # Nothing left to split.  This also prevents unbounded recursion
        # when the empty string happens to be a key of ``ppls``.
        return z
    words = s.split(' ')
    for size in (2, 3):
        candidate = ' '.join(words[:size])
        if candidate in ppls:
            print(candidate)
            z.append(candidate)
            find_names(' '.join(words[size:]), ppls, z)
            break
    return z


if __name__ == "__main__":
    # Scraping dependencies are imported here so that importing this module
    # (e.g. to reuse find_names) needs neither the packages nor the network.
    import bs4
    import requests

    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever one is
    # installed, which makes results machine-dependent and emits a warning.
    soup = bs4.BeautifulSoup(r.content, "html.parser")
    tables = soup.find_all("table")

    exps, ppls, ppcs, msns, cnts = _collect_links(tables[:2])

    # Parse the HTML that was already downloaded instead of fetching the
    # page a second time.
    df = pd.concat(pd.read_html(io.StringIO(r.text))[:2]).reset_index()

    ppls  # notebook-style display; has no effect when run as a script

    for i, crew in df['Crew'].items():
        # Cells that are not text (e.g. NaN) are treated as empty.
        crew = crew.replace('\n', '') if isinstance(crew, str) else ''
        crews = find_names(crew, ppls, [])
        # The whole cell may itself be one known name that find_names cannot
        # match (one word, or more than three); avoid adding it twice when
        # find_names already returned it.
        if crew in ppls and crew not in crews:
            crews.append(crew)
        print(crew, crews)

    # Notebook-style preview of the last row visited by the loop above.
    df.loc[[i], ['Expedition', 'Launch date', 'Duration(days)']]