import pandas as pd, numpy as np
url='https://en.wikipedia.org/wiki/List_of_International_Space_Station_expeditions'
import bs4
import requests
# Download the expedition list page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops it from
# silently parsing an HTTP error page as if it were the article.
r = requests.get(url, timeout=30)
r.raise_for_status()
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one it
# finds installed; name one explicitly if results must be reproducible.
soup = bs4.BeautifulSoup(r.content)
# Every <table> element of the page; the code below only uses the first two.
tables = soup.findAll("table")
# ppls maps the text of each anchor found in the cells of the first two
# tables to that anchor's href.
ppls = {}
for table in (tables[0], tables[1]):
    # The first <tr> of each table is skipped.
    for row in table.findAll("tr")[1:]:
        for cell in row.findAll("td"):
            for anchor in cell.findAll("a"):
                if not anchor:
                    continue
                label = anchor.text
                # Ignore anchors without text and those whose text contains '['.
                if not label or '[' in label:
                    continue
                # The first href seen for a given text is kept.
                if label not in ppls:
                    ppls[label] = anchor['href']
# Let pandas parse the tables of the same page and stack the first two into
# one frame. reset_index() renumbers the rows and keeps the previous row
# labels in an 'index' column.
df = pd.concat(pd.read_html(url)[:2]).reset_index()
def find_names(s, ppls, z):
    """Split a run-together crew string into the names found in *ppls*.

    At each position the next two space-separated words are tried as a name,
    then the next three. The first candidate present in *ppls* is appended to
    *z* and scanning resumes right after it. Scanning stops at the first
    position where neither candidate is known, so anything following an
    unrecognised name is ignored.

    Args:
        s: Crew names separated by single spaces.
        ppls: Container of known names (only membership is tested).
        z: List that receives the matched names, in order.

    Returns:
        The same list object *z*.
    """
    words = s.split(' ')
    pos = 0
    while True:
        for size in (2, 3):
            candidate = ' '.join(words[pos:pos + size])
            if candidate in ppls:
                z.append(candidate)
                pos += size
                break
        else:
            # Neither the two-word nor the three-word candidate is known.
            return z
def _format_launch_date(raw):
    """Rebuild a launch-date string from its first three space-separated tokens.

    When the second token contains a comma, the first two tokens are swapped
    and the comma is removed; otherwise their order is kept. The third token
    is cut to its first four characters.
    """
    parts = raw.split(' ')
    if ',' in parts[1]:
        first, second = parts[1].replace(',', ''), parts[0]
    else:
        first, second = parts[0], parts[1]
    return first + ' ' + second + ' ' + parts[2][:4]


# Build one single-row frame per (table row, crew member) pair, carrying the
# row's 'Expedition' and 'Duration(days)' values plus the member's name and
# the reformatted launch date.
dgs = []
for row_label in df.index:
    row = df.loc[row_label]
    crew_text = row['Crew'].replace('\n', '')
    for member in find_names(crew_text, ppls, []):
        entry = df.loc[[row_label]][['Expedition', 'Duration(days)']].copy()
        entry['Crew'] = member
        entry['Date'] = _format_launch_date(row['Launch date'])
        dgs.append(entry)
dgs = pd.concat(dgs).reset_index()
# The same rows keyed by (Expedition, Crew); get_duration() looks durations up here.
dhs = dgs.set_index(['Expedition', 'Crew'])
def get_duration(duration):
    """Resolve one 'Duration(days)' cell, following a transfer reference.

    The value is converted to text first. A missing value ('nan') yields the
    default '160'. Text containing 'ransfer' is treated as a reference: its
    last two space-separated words are used as the first-level key into the
    module-level table ``dhs``, and the module-level name ``crew`` as the
    second-level key, to fetch that entry's 'Duration(days)'. Any other text
    is returned unchanged.

    Args:
        duration: Cell value; any type accepted by ``str()``.

    Returns:
        The text form of *duration*, the default '160', or, for a reference,
        whatever value the ``dhs`` lookup produced (returned as stored).
    """
    default = '160'
    duration = str(duration)
    if duration == 'nan':
        return default
    if 'ransfer' in duration:
        duration = ' '.join(duration.split(' ')[-2:])
        try:
            duration = dhs.loc[duration].loc[crew]['Duration(days)']
        except Exception:
            # Best-effort lookup: report the pair that could not be resolved
            # and fall back to the default. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt/SystemExit propagate.
            print(crew, duration)
            return default
    return duration
# Collect one {'Crew', 'Date', 'Duration'} record per crew entry whose date
# text does not contain 'ransfer'.
data = []
for row_label in dgs.index:
    entry = dgs.loc[row_label]
    # get_duration() reads the module-level name `crew`, so it must be bound
    # under exactly this name before the calls below.
    crew = entry['Crew']
    date = entry['Date']
    if 'ransfer' in date:
        continue
    duration = entry['Duration(days)']
    # Three passes, so a reference that resolves to another reference is
    # followed up to three steps.
    for _ in range(3):
        duration = get_duration(duration)
    if '[' in duration:
        # Keep only the text before the first '['.
        duration = duration.partition('[')[0]
    cleaned = duration.replace('days', '').replace('day', '').strip()
    duration = int(np.round(float(cleaned), 0))
    data.append({'Crew': crew, 'Date': date, 'Duration': duration})
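# Output (crew / lookup-key pairs printed by get_duration() when the lookup
# failed and the default duration was used):
# Mikhail Korniyenko year mission
# Scott J. Kelly year mission
# Timothy Peake Expedition 47
# Aleksey Ovchinin Expedition 60
# Christina Koch Expedition 60
# Nick Hague Expedition 60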
# Turn the list of {'Crew', 'Date', 'Duration'} records into a DataFrame.
data=pd.DataFrame(data)
# Preview of the first rows; the result is not stored or printed here.
data.head()
# Output of data.head():
# |   | Crew                | Date            | Duration |
# |---|---------------------|-----------------|----------|
# | 0 | William M. Shepherd | 31 October 2000 | 141      |
# | 1 | Sergei Krikalev     | 31 October 2000 | 141      |
# | 2 | Yuri Gidzenko       | 31 October 2000 | 141      |
# | 3 | Yuri Usachev        | 8 March 2001    | 167      |
# | 4 | James S. Voss       | 8 March 2001    | 167      |
# links: anchor text -> href, plus image alt text -> image src for text-less
# anchors. country_map: crew name -> alt text of the image inside the anchor
# that precedes the name's anchor in the same row.
links = {}
country_map = {}
for table in (tables[0], tables[1]):
    # The first <tr> of each table is skipped.
    for row in table.findAll("tr")[1:]:
        anchors = row.findAll("a")
        for pos, anchor in enumerate(anchors):
            if not anchor:
                continue
            label = anchor.text
            if not label:
                # Text-less anchor: record its image, except for the first
                # two anchors of the row.
                if pos > 1:
                    image = anchor.find('img')
                    country = image['alt']
                    links[country] = image['src']
                continue
            if '[' in label:
                continue
            links[label] = anchor['href']
            # Only the first occurrence of a crew name sets its country.
            if label in data['Crew'].values and label not in country_map:
                country_map[label] = anchors[pos - 1].find('img')['alt']
# Reshape both dictionaries into one-column frames indexed by their keys
# ('Country' and 'Link' respectively).
countries = pd.DataFrame(country_map, index=['Country']).transpose()
links = pd.DataFrame(links, index=['Link']).transpose()
# Attach each crew member's country, then the link stored for the crew name
# and the link stored for the country.
data = data.join(countries, on='Crew')
for key_column, link_column in (('Crew', 'Crew_link'), ('Country', 'Country_link')):
    data[link_column] = data.join(links, on=key_column)['Link']
data.to_csv('data.csv')
# ndata[date][country] counts the crew entries of that country whose stay
# covers that date; a stay covers `Duration` consecutive days starting on
# `Date`.
ndata = {}
for row_label in data.index:
    record = data.loc[row_label]
    start = pd.to_datetime(record['Date'])
    periods = record['Duration']
    country = record['Country']
    for day in pd.date_range(start, periods=periods, freq='1D'):
        date = str(day)[:10]  # first ten characters of the timestamp text
        per_country = ndata.setdefault(date, {})
        per_country[country] = per_country.get(country, 0) + 1
pd.DataFrame(ndata).to_csv('ndata.csv')
import json
# Write the per-date, per-country counts as JSON. The context manager closes
# the file deterministically instead of leaving the handle to the garbage
# collector.
with open('ndata.json', 'w') as handle:
    handle.write(json.dumps(ndata))
# Output: 385280  (value returned by write())
# ndata2 holds one record per crew entry per day of the stay: the date, the
# crew member's name, the country and a constant 'Crew' value of 1.
ndata2 = []
for row_label in data.index:
    record = data.loc[row_label]
    start = pd.to_datetime(record['Date'])
    periods = record['Duration']
    crew = record['Crew']
    country = record['Country']
    for day in pd.date_range(start, periods=periods, freq='1D'):
        ndata2.append({'Date': str(day)[:10], 'Name': crew, 'Country': country, 'Crew': 1})
# The context manager closes the file deterministically; the original left
# the handle open.
with open('ndata2.json', 'w') as handle:
    handle.write(json.dumps(ndata2))
# Output: 2411906  (value returned by write())
# Write only the first 1000 per-day records to a separate file. The context
# manager closes the file deterministically.
with open('ndata2a.json', 'w') as handle:
    handle.write(json.dumps(ndata2[:1000]))
# Output: 86441  (value returned by write())
# Transposed counts: one row per date, one column per country.
df = pd.DataFrame(ndata).T
df.to_csv('ndataT.csv')
# Move the dates out of the index into an ordinary 'Date' column.
df.index.name = 'Date'
df = df.reset_index()
# edata maps every column name (including 'Date') to the list of that
# column's values.
edata = {column: list(df[column].values) for column in df.columns}
# Preview of the keys; the result is not stored or printed here.
edata.keys()
# Output: dict_keys(['Date', 'Belgium', 'Canada', 'France', 'Germany', 'Italy', 'Japan', 'Netherlands', 'Russia', 'United Kingdom', 'United States'])
# Write the column-oriented data as JSON. The context manager closes the file
# deterministically, so the output is flushed before the script continues.
with open('edata.json', 'w') as handle:
    json.dump(edata, handle)