#!/usr/bin/env python
# coding: utf-8
"""Scrape member pages from cdep.ro and reshape them into per-person timelines.

Stage 1 (scrape) parses every member page linked from the index page, or the
cached copies in ``data/pages.pkl``, into flat records written to
``data/members.json``.

Stage 2 (clean) merges records that belong to the same person (name + birth
date) and derives dated party, country-group, committee and activity entries,
written to ``data/nmembers.json`` and ``data/deaths.json``.

NOTE(review): block nesting in this file was reconstructed from context; the
spots where more than one nesting was plausible are marked NOTE(review) and
should be confirmed against the original notebook.
"""

import json
import os
import pickle

import bs4
import numpy as np
import pandas as pd
import requests

# os.chdir('E:/Onedrive - Lancaster University/datarepo/influence/ro')
os.chdir('C:/users/csala/Onedrive - Lancaster University/datarepo/influence/ro')

base_url = 'http://www.cdep.ro'
url = base_url + '/pls/parlam/structura2015.ab?idl=1'

# Index page: the second table holds the links to the member pages.
r = requests.get(url)
soup = bs4.BeautifulSoup(r.content)
tables = soup.findAll('table')
table = tables[1]
links = list({a['href'] for a in table.findAll('a')})


def state_format(j):
    """Return `j` without newline-plus-space sequences and outer whitespace."""
    # NOTE(review): the whitespace inside this literal may originally have
    # been wider than a single space -- confirm.
    j = j.replace('\n ', '')
    return j.strip()


def save_obj(obj, name):
    """Pickle `obj` to data/<name>.pkl."""
    with open('data/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Unpickle and return data/<name>.pkl (only load files you trust)."""
    with open('data/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)


def divformat(div):
    """Split a committee box's lower-cased text into one chunk per committee."""
    text = div.text.lower()
    # 'subcomisia' / 'comisiei' are rewritten to 'comisia' so that a single
    # split token separates every entry.
    text = text.replace('subcomisia ', 'comisia ').replace('comisiei ', 'comisia ')
    return text.split('comisia ')[1:]


def _query_value(link, key):
    """Return the text between `key` (e.g. 'idm=') and the next '&' in `link`."""
    begin = link.find(key) + len(key)
    return link[begin:begin + link[begin:].find('&')]


def _committee_names(div, prefix=''):
    """Return the 'Comisia ...' names found in `div`, each tagged with `prefix`."""
    return [prefix + 'Comisia ' + j.replace('\n', '').strip() for j in divformat(div)]


def _group_rows(div, prefix=''):
    """Return (names, abbreviations, image urls) for every table row of `div`.

    Missing links/images give '' (without the prefix).
    """
    rows = div.findAll('tr')
    names = [prefix + state_format(row.text) for row in rows]
    abbrs = ['' if row.find('a') is None else prefix + row.find('a').text
             for row in rows]
    imgs = ['' if row.find('img') is None else prefix + row.find('img')['src']
            for row in rows]
    return names, abbrs, imgs


members = []
party_imgs = {}
state_imgs = {}
parsed_links = set()
# if available, load the cached pages instead of downloading them again
load_from_pages = True
if load_from_pages:
    pages = load_obj('pages')
else:
    pages = {}

for link in links:
    if link in parsed_links:
        continue
    if len(parsed_links) % 100 == 0:
        print(len(parsed_links) / len(links) * 100, '%')
    idm = _query_value(link, 'idm=')
    leg = _query_value(link, 'leg=')
    url = base_url + link
    if not load_from_pages:
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.content)
        pages[link] = str(soup)
    else:
        soup = bs4.BeautifulSoup(pages[link])

    name = soup.find('title').text
    olddiv = soup.find('div', {'id': 'olddiv'})
    pretty_name = olddiv.find('h1').text
    img = olddiv.find('img')['src']
    divs = olddiv.find('div').findAll('div', {'class': 'boxDep'})
    head = divs[0]
    camera = head.find('h3').text
    ikamera = link[-1]
    judet = head.find('a').text
    birth_ro = (soup.find('div', {'class': 'profile-pic-dep'}).text
                .replace('\n', '').replace('n.', '').strip())

    # Optional mandate start/end dates are cut out of the first box's markup:
    # the text after the marker, up to the next tag.
    head_html = repr(head)
    start = ''
    end = ''
    if 'data valid' in head_html:
        start = head_html[head_html.find('data valid') + 14:]
        start = start[:start.find('<')]
        start = start.replace(':', '').replace('-', '').strip()
    if 'mandatului' in head_html:
        end = head_html[head_html.find('mandatului') + 10:]
        end = end[:end.find('<')]
        end = end.replace(':', '').replace('-', '').strip()

    comisii = []
    comisii_abbr = []
    parties = []
    gparties = []
    state = []
    state_abbr = []
    state_img = []
    activitate = []

    # Each box is dispatched on a fragment of its <h3> header.
    for div in divs:
        header = div.find('h3').text
        if 'omisii permanente' in header:
            comisii += _committee_names(div)
            comisii_abbr += [a.text for a in div.findAll('a')]
        if 'Biroului Permanent' in header:
            # NOTE(review): reads divs[1] rather than the current div --
            # confirm this box is always the second one.
            bp_rows = divs[1].findAll('tr')
            comisii += ['#BP#' + row.text for row in bp_rows]
            comisii_abbr += ['#BP#' + ikamera for row in bp_rows]
        if 'omisii special' in header:
            comisii += _committee_names(div, '#SPEC#')
            comisii_abbr += ['#SPEC#' + a.text for a in div.findAll('a')]
        if 'omisii de anch' in header:
            comisii += _committee_names(div, '#ANCH#')
            comisii_abbr += ['#ANCH#' + a.text for a in div.findAll('a')]
        if 'lte comisii' in header:
            comisii += _committee_names(div)
            comisii_abbr += [a.text for a in div.findAll('a')]
        if 'iunea politic' in header:
            parties = [row for row in div.findAll('tr', {'valign': 'center'})
                       if row.findAll('table')]
        if 'upul parlamentar:' in header:
            if div.find('table'):
                group_text = (div.find('table').text
                              .replace('Senator', 'Grupul parlamentar')
                              .replace('Deputa', 'Grupul parlamentar')
                              .replace('Grupul Parlamentar', 'Grupul parlamentar'))
                gparties = ['#GRUP#Grupul parlamentar' + j.replace('\n', '').strip()
                            for j in group_text.split('Grupul parlamentar')][1:]
        if 'altor state' in header:
            names, abbrs, imgs = _group_rows(div)
            state += names
            state_abbr += abbrs
            state_img += imgs
        if 'mentare interna' in header:
            names, abbrs, imgs = _group_rows(div, '#INTER#')
            state += names
            state_abbr += abbrs
            state_img += imgs
        if 'lte grupuri' in header:
            names, abbrs, imgs = _group_rows(div)
            state += names
            state_abbr += abbrs
            state_img += imgs
        if 'cifre' in header:
            activitate = [row.text.split(':') for row in div.findAll('tr')
                          if row.text.split(':') != ['']]

    party = [row.find('table').text for row in parties]
    party_abbr = ['' if row.find('table').find('a') is None
                  else row.find('table').find('a').text for row in parties]
    party_img = ['' if row.find('img') is None else row.find('img')['src']
                 for row in parties]
    # Parliamentary groups ride along in the party lists, without abbr/image.
    party += gparties
    party_abbr += ['' for _ in gparties]
    party_img += ['' for _ in gparties]

    # The first image seen for an abbreviation wins.
    for abbr_key, src in zip(state_abbr, state_img):
        if abbr_key not in state_imgs:
            state_imgs[abbr_key] = src
    for src, abbr_key, full in zip(party_img, party_abbr, party):
        if abbr_key not in party_imgs:
            party_imgs[abbr_key] = {'name': full, 'img': src}

    members.append({'name': name, 'birth_ro': birth_ro, 'idm': idm, 'link': url,
                    'leg': leg, 'start': start, 'img': img,
                    'pretty_name': pretty_name, 'camera': camera, 'judet': judet,
                    'end': end, 'party_abbr': party_abbr, 'party': party,
                    'state': state, 'state_abbr': state_abbr,
                    'activitate': activitate, 'comisii': comisii,
                    'comisii_abbr': comisii_abbr})
    parsed_links.add(link)

# Persist the raw scrape, then continue from the JSON round-tripped copy.
with open('data/members.json', 'w') as f:
    f.write(json.dumps({'members': members,
                        'party_imgs': party_imgs,
                        'state_imgs': state_imgs}))

with open('data/members.json', 'r') as f:
    scraped = json.loads(f.read())
members = scraped['members']
party_imgs = scraped['party_imgs']
state_imgs = scraped['state_imgs']

save_obj(pages, 'pages')

# ---------------------------------------------------------------------------
# Clean
# ---------------------------------------------------------------------------

# Default [start, end] of a mandate, keyed by the 'leg' value of the page link.
legs = {'2016': ['2016-12-21', '2019-07-01'],
        '2012': ['2012-12-20', '2016-12-20'],
        '2008': ['2008-12-19', '2012-12-19'],
        '2004': ['2004-12-19', '2008-12-13'],
        '2000': ['2000-12-15', '2004-11-30'],
        '1996': ['1996-11-27', '2000-11-30'],
        '1992': ['1992-10-28', '1996-11-22'],
        '1990': ['1990-06-19', '1992-10-16']}

# Romanian month tokens -> English abbreviations understood by pandas.
# Order matters for replace_all(): abbreviated forms are tried first.
ro_months = {'ian.': 'Jan',
             ' ia ': ' Jan ',
             'feb.': 'Feb',
             'mar.': 'Mar',
             'apr.': 'Apr',
             'mai': 'May',
             'iun.': 'Jun',
             ' iu ': ' Jun ',
             'iul.': 'Jul',
             'aug.': 'Aug',
             'sep.': 'Sep',
             'oct.': 'Oct',
             'noi.': 'Nov',
             'dec.': 'Dec',
             'ianuarie': 'Jan',
             'februarie': 'Feb',
             'martie': 'Mar',
             'aprilie': 'Apr',
             'iunie': 'Jun',
             'iulie': 'Jul',
             'august': 'Aug',
             'septembrie': 'Sep',
             'octombrie': 'Oct',
             'noiembrie': 'Nov',
             'decembrie': 'Dec'}


def date_ro(d, s=True):
    """Parse a '<ro month> <year>' fragment such as 'feb. 2013'.

    Parentheses are ignored and three known year typos are corrected.  Text
    containing 'prezent' maps to the end of the 2016 entry in `legs`.

    Returns 'YYYY-MM-DD' when `s` is true, otherwise the pandas Timestamp.
    Raises KeyError when the first token is not a key of `ro_months`.
    """
    if 'prezent' in d:
        t = pd.to_datetime(legs['2016'][1])
    else:
        ds = d.replace(')', '').replace('(', '').split(' ')
        x = ds[1].replace('0201', '2001').replace('0092', '1992').replace('0213', '2013')
        t = pd.to_datetime(ro_months[ds[0].strip()] + ' ' + x)
    if s:
        return str(t)[:10]
    return t


def date_ro2(d, s=False):
    """Parse a '<day> <ro month> <year>' string and return 'YYYY-MM-DD'.

    `s` is accepted for call compatibility only; its value is not used.
    """
    # Squeeze runs of spaces so the three fields split cleanly.
    while '  ' in d:
        d = d.replace('  ', ' ')
    ds = d.split(' ')
    t = pd.to_datetime(ds[0] + ' ' + ro_months[ds[1]] + ' ' + ds[2])
    s = str(t)[:10]
    # Manual fix for this Monitorul Oficial entry:
    # - http://www.cdep.ro/pls/legis/legis_pck.htp_act?ida=54223
    #   appears as 17 Feb 2004
    # - http://www.cdep.ro/pls/parlam/structura2015.mp?idm=84&leg=2004&cam=1
    #   should be 17 Dec 2004
    if s == '2004-02-17':
        s = '2004-12-17'
    return s


# Members whose page shows a 'birth-death' year span instead of a birth date:
# key is page title + span, value is the birth date to substitute.
decess = {'Ratiu Ion1917-2000': '6 Jun 1917',
          'Popovici Dan Ion Cristian1946-1996': '26 Dec 1946',
          'Palfi Mozes Zoltan1943-2011': '1 Jan 1943',
          'Blaga Ionel1929-1994': '17 Mar 1929',
          'Daraban Aurel1939-2004': '27 Sep 1939',
          'Croitoru Mircea-Adrian1941-1999': '1 Jan 1941',
          'Budeanu Radu1943-1997': '1 Jan 1943',
          'Coposu Corneliu1914-1995': '20 May 1914',
          'Ignat Miron1941-2018': '24 Aug 1941',
          'Nastase Toma1932-1997': '1 Jan 1932',
          'Musat Mircea1930-1994': '1 Jan 1930',
          'Stoica Stefan1976-2014': '1 Jan 1976',
          'Grama Mihail1924-1999': '1 Jan 1924',
          'Bot Octavian1951-2015': '1 Jan 1951',  # real 1 Jan
          'Iorgovan Antonie1948-2007': '9 Aug 1948',
          'Dinescu Valentin1955-2008': '25 Dec 1955',
          'Babias Iohan-Peter1952-2002': '28 Jun 1952',
          'Munteanu Mircea Mihai1933-1998': '26 May 1933',
          'Timis Ioan1951-2010': '17 Sep 1951',
          'Barbu Eugen1924-1993': '1 Jan 1924',
          'Tcaciuc Stefan1936-2005': '13 Jan 1936',
          'Mircovici Niculae1950-2016': '1 Oct 1950',
          'Rusu Horia Mircea1952-2001': '18 Sep 1952',
          'Racoceanu Viorel1962-2006': '8 Jun 1962',
          'Andrei Zeno1935-2001': '1 Jan 1935',
          'Surdu-Soreanu Raul-Victor1947-2011': '11 Jul 1947',
          'Dan Iosif1950-2007': '14 Oct 1950',
          'Dutu Ion1942-2000': '7 Oct 1942',
          'Bindea Liviu-Doru1957-2006': '26 Jul 1957',
          'Verestoy Attila1954-2018': '1 Mar 1954',
          'Dragomir Nelu Aristide1957-1995': '13 Oct 1957',
          'Micle Ulpiu-Radu-Sabin1935-2000': '1 Jan 1935',
          'Cojocariu Emil1938-1994': '2 Dec 1938',
          'Policrat Rene-Radu1910-1993': '12 Aug 1910',
          'Serban Gheorghe1954-1998': '25 Jun 1954',
          'Vladoiu Aurel1948-2015': '27 Jan 1948',
          'Preda Ion1947-2007': '1 Jan 1947',
          'Coste Marina-Adelina1965-2017': '30 Nov 1965',
          'Sincai Ovidiu1949-1999': '14 Dec 1949',
          'Grosaru Mircea1952-2014': '30 Jun 1952',
          'Florescu Nicolae-Doru1960-2001': '1 jan 1960',
          'Sinko Stefan1939-1995': '1 Jan 1939',
          'Alecsandrescu Nicolae1923-1993': '1 Jan 1923',
          'Ratoi Neculai1939-2016': '15 Mar 1939',
          'Ichim Mircea-Adrian1944-1993': '1 Jan 1944',
          'Fotopolos Sotiris1937-2008': '6 Dec 1937',
          'Ciobanu Gheorghe1964-2015': '22 Sep 1964',
          'Dumitrescu Liana1973-2011': '20 Jan 1973',
          'Dida Corneliu Ioan1942-2008': '26 May 1942',
          'Draghici Sonia-Maria1956-2016': '25 Jul 1956'}

# Filler tokens removed from membership rows before date parsing.
state_abbs = {'membru': '', '\xa0': '', 'supleant': ''}


def replace_all(text, dic):
    """Apply every key -> value replacement of `dic` to `text`, in dict order."""
    for old in dic:
        text = text.replace(old, dic[old])
    return text


# Role words recognised by president(), in matching priority order.
_ROLE_MARKERS = (
    ('Vicepreşedinte', '#VP#'),
    ('vicepreşedinte', '#VP#'),
    ('preşedinte', '#PRES#'),
    ('Preşedinte', '#PRES#'),
    ('Secretar', '#SECR#'),
    ('secretar', '#SECR#'),
    ('Trezorier', '#TREZ#'),
    ('trezorier', '#TREZ#'),
    ('chestor', '#CHES#'),
    ('Chestor', '#CHES#'),
    ('Şeful', '#SEF#'),
)


def president(s):
    """Strip the first recognised role word from `s`.

    Returns (stripped text, role tag); the tag is '' when no role word occurs.
    Only the first marker that matches, in `_ROLE_MARKERS` order, is removed.
    """
    for marker, abb in _ROLE_MARKERS:
        if marker in s:
            return s.replace(marker, '').strip(), abb
    return s.strip(), ''


# Role words inside parliamentary-group rows.  '$' marks where a new piece
# starts and '%' is later replaced by the group's own label.
gpresident = {'Vicelider': '$#VP#%',
              'vicelider': '$#VP#%',
              'Secretar': '$#SECR#%',
              'secretar': '$#SECR#%',
              'Lider': '$#PRES#%',
              'Purtător de cuvânt': '$#PRCV#%',
              'lider': '$#PRES#%'}

# Same scheme for committee rows.
gpresident2 = {'Vicepreşedinte': '$#VP#%',
               'vicepreşedinte': '$#VP#%',
               'preşedinte': '$#PRES#%',
               'Preşedinte': '$#PRES#%',
               'Secretar': '$#SECR#%',
               'secretar': '$#SECR#%'}


def _term_bounds(member):
    """Return (since, until) for a scraped record.

    Defaults come from `legs`; the page's own start/end dates win when present.
    """
    since = str(pd.to_datetime(legs[member['leg']][0]))[:10]
    until = str(pd.to_datetime(legs[member['leg']][1]))[:10]
    if member['start']:
        since = date_ro2(member['start'], True)
    if member['end']:
        until = date_ro2(member['end'])
    return since, until


def _after(text, marker, width=None):
    """Return what follows the first `marker` in `text`, at most `width` chars."""
    begin = text.find(marker) + len(marker)
    return text[begin:] if width is None else text[begin:begin + width]


def _squeeze(text):
    """Collapse short runs of spaces (three passes of double -> single)."""
    return text.replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').strip()


def _committee_piece(pk, abbr, since, until):
    """Parse one '$'-separated piece of a committee row.

    Returns (label, since, until): the label is the text up to and including
    `abbr`; the dates are narrowed by any 'din', 'până în' or 'a - b' range
    found after it.
    """
    kabbr = _squeeze(pk.replace('%', abbr))
    cut = kabbr.find(abbr) + len(abbr)
    label = kabbr[:cut].strip()
    s = kabbr[cut:].strip()
    if 'din ' in s and 'din sumele' not in s:
        since = max(since, date_ro(_after(s, 'din ', 9).replace(')', '').strip(), True))
    if 'până în ' in s:
        until = min(until, date_ro(_after(s, 'până în ', 9).replace(')', '').strip(), True))
    if '-' in s and len(s.split('-')[-1]) > 4:
        s0 = s.split('-')[0].strip()
        s1 = s.split('-')[1].strip()
        if len(s0) < 6:
            # 'feb.' style start without a year: borrow the year of the end.
            s0 = s0 + ' ' + s1.split(' ')[1].strip()
        since = max(since, date_ro(s0, True))
        until = min(until, date_ro(s1, True))
    return label, since, until


nmembers = {}
nparty_imgs = {}
nstate_imgs = {}
party_set = set()
country_set = set()
comisii_set = set()
deaths = {}

for member in members[:]:
    # ---- identity: "<page title> | <birth date>" -------------------------
    raw_birth = member['birth_ro']
    if '-' not in raw_birth:
        birth = str(pd.to_datetime(replace_all(raw_birth, ro_months)))
        death = ''
    else:
        birth = str(pd.to_datetime(replace_all(member['name'] + raw_birth, decess)))
        death = raw_birth.split('-')[1].strip()
    if len(birth) > 3:
        name = member['name'] + ' | ' + birth[:10]
    else:
        # A 3-character result ('NaT') means no parsable birth date.
        name = member['name'] + ' | Ismeretlen'
    if death and name not in deaths:
        deaths[name] = death

    # Mandate bounds are constant per record; every section restarts from them.
    term_since, term_until = _term_bounds(member)

    record = nmembers.setdefault(name, {})
    record.setdefault('Name', {'full': member['pretty_name'],
                               'simple': name,
                               'short': member['name']})
    record.setdefault('Photo', []).append(base_url + member['img'])
    record.setdefault('UserID', []).append(member['idm'])

    # Chamber label, tagged with a role found in the chamber text or, failing
    # that, in the display name.
    dummy, pres = president(member['camera'])
    if not pres:
        dummy, pres = president(member['pretty_name'])
    record.setdefault('Camera', []).append(pres + member['camera'])

    record.setdefault('Starts', []).append(term_since)
    record.setdefault('Ends', []).append(term_until)
    record.setdefault('Link', []).append(member['link'])

    # ---- parties and parliamentary groups --------------------------------
    party_entries = record.setdefault('Parties', [])
    parties = member['party']
    parties_abbr = member['party_abbr']
    pi = 0
    while pi < len(parties):
        p, pres = president(parties[pi].strip())
        since, until = term_since, term_until
        if '#GRUP#' not in p:
            if '-' not in p:
                abbr = 'Minorități' if p != 'independent' else 'Independent'
            else:
                abbr = parties_abbr[pi]
                party = p.split('-')[1].strip()
                if 'din ' in party and 'din R' not in party and 'din B' not in party:
                    since = max(since, date_ro(_after(party, 'din ').strip(), True))
                # Also look past the second dash, for rows carrying both a
                # 'din' and a 'până în' date.
                pieces = p.split('-')
                if len(pieces) > 2:
                    s = ''.join(pieces[2:]).strip()
                    if 'din ' in s and 'din R' not in s:
                        since = max(since, date_ro(_after(s, 'din ').strip(), True))
                    if 'până în ' in s:
                        until = min(until, date_ro(_after(s, 'până în ').strip(), True))
                if abbr in ['independent', '', 'Neafiliaţi']:
                    abbr = 'Independent'
            party_entries.append({'party': pres + abbr, 'start': since,
                                  'end': until, 'judet': member['judet']})
            party_set.add(abbr)
        else:
            p = parties[pi].strip()
            if 'se transf' not in p:
                abbr = p.split('\xa0')[0]
                p = replace_all(p, gpresident)
                # One entry per piece: plain membership first, then each role.
                # since/until deliberately carry over between pieces.
                for pk in p.split('$'):
                    kabbr = _squeeze(pk.replace('%', abbr))
                    cut = kabbr.find(abbr) + len(abbr)
                    pabbr = kabbr[:cut].strip()
                    s = kabbr[cut:].strip()
                    if 'din ' in s:
                        since = max(since, date_ro(_after(s, 'din ').strip(), True))
                    if 'până în ' in s:
                        until = min(until, date_ro(_after(s, 'până în ').strip(), True))
                    party_entries.append({'party': pabbr, 'start': since,
                                          'end': until, 'judet': member['judet']})
                    party_set.add(pabbr)
            else:
                # Group transfer: this row plus the next two describe a move
                # from one group (p0) to another (p1) on date since2.
                p0 = parties[min(pi + 2, len(parties) - 1)].strip()
                p1 = parties[pi + 1].strip()
                abbr0 = p0.split('\xa0')[0]
                abbr1 = p1.split(' în ')[0]
                since2 = date_ro(p1.split(' în ')[1].replace('-', '. ').replace('fost', ''))
                p0 = replace_all(p0, gpresident)
                for pk in p0.split('$'):
                    kabbr0 = _squeeze(pk.replace('%', abbr0))
                    cut = kabbr0.find(abbr0) + len(abbr0)
                    pabbr0 = kabbr0[:cut].strip().split('(')[0].strip()
                    s = kabbr0[cut:].strip()
                    if 'din ' in s:
                        since = max(since, date_ro(_after(s, 'din ').strip(), True))
                    if 'până în ' in s:
                        until = min(until, date_ro(_after(s, 'până în ').strip(), True))
                # NOTE(review): the two appends are placed after the loop
                # (using the last piece's label) -- confirm.
                party_entries.append({'party': pabbr0, 'start': since,
                                      'end': since2, 'judet': member['judet']})
                party_entries.append({'party': abbr1, 'start': since2,
                                      'end': until, 'judet': member['judet']})
                party_set.add(pabbr0)
                party_set.add(abbr1)
                pi += 2  # skip the two rows consumed above
        pi += 1

    # ---- friendship groups / delegations ---------------------------------
    country_entries = record.setdefault('Countries', [])
    states = member['state']
    states_abbr = member['state_abbr']
    for pi in range(len(states)):
        p, pres = president(states[pi].strip())
        # '#INTER#' rows are skipped unless they match one of these fragments.
        cont = True
        if '#INTER#' in p:
            cont = ('Bucureşti - Chişinău' in p) or ('elega' in p) or ('Europol' in p)
        if not cont:
            continue
        abb_state = state_format(states_abbr[pi])
        since, until = term_since, term_until
        if '-' in p:
            s = replace_all(''.join(p.split('-')[1:]).strip(), state_abbs).strip()
            if 'din ' in s and 'din R' not in s:
                since = max(since, date_ro(_after(s, 'din ').strip(), True))
            if 'până în ' in s:
                until = min(until, date_ro(_after(s, 'până în ').strip(), True))
        # NOTE(review): appended for every kept row, dated or not -- confirm.
        country_entries.append({'country': pres + abb_state,
                                'start': since, 'end': until})
        country_set.add(abb_state)

    # ---- committees and Permanent Bureau ---------------------------------
    group_entries = record.setdefault('Groups', [])
    comisii = member['comisii']
    comisii_abbr = member['comisii_abbr']
    for pi in range(len(comisii_abbr)):
        p = comisii[pi].strip()
        abbr = comisii_abbr[pi].strip()
        if '#BP#' in p:
            p = replace_all(p, state_abbs)
            p, pres = president(p)
            if '#BP#Atribu' not in p:
                since, until = term_since, term_until
                if '-' in p:
                    s0 = (p.replace('#BP#', '')
                          .replace('în sesiunea parlamentară:', '')
                          .replace('atributii', ''))
                    s = s0.split('-')
                    if len(s[0]) < 6:
                        # Start given without a year: borrow it from the end.
                        s[0] = s[0] + s[1].strip()[-4:]
                    since = max(since, date_ro(s[0].strip(), True))
                    until = min(until, date_ro(s[1].strip(), True))
                    # NOTE(review): `s` is a list here, so both membership
                    # tests below compare whole elements, not substrings;
                    # kept as found.  Nesting under "'-' in p" is assumed
                    # because s0 only exists on this path -- confirm.
                    if 'din ' in s0:
                        if 'din sumele' not in s:
                            since = max(since, date_ro(_after(s0, 'din ').strip(), True))
                    if 'până în ' in s:
                        until = min(until, date_ro(_after(s0, 'până în ').strip(), True))
                group_entries.append({'group': pres + abbr,
                                      'start': since, 'end': until})
                comisii_set.add(abbr)
        else:
            p = replace_all(p, gpresident2)
            if '$' not in p:
                # Plain membership, optionally dated.
                since, until = term_since, term_until
                kabbr = _squeeze(p)
                cut = kabbr.find(abbr) + len(abbr)
                pabbr = kabbr[:cut].strip()
                s = kabbr[cut:].strip()
                if 'din ' in s and 'din sumele' not in s:
                    since = max(since, date_ro(
                        _after(s, 'din ', 9).replace(')', '').strip(), True))
                if 'până în ' in s:
                    until = min(until, date_ro(
                        _after(s, 'până în ', 9).replace(')', '').strip(), True))
                group_entries.append({'group': pabbr, 'start': since, 'end': until})
                comisii_set.add(pabbr)
            else:
                # Membership (piece 0) plus a role (piece 1): the role interval
                # is clipped to the membership interval and the membership is
                # emitted for whatever time remains outside the role.
                pieces = p.split('$')
                pabbr0, since0, until0 = _committee_piece(
                    pieces[0], abbr, term_since, term_until)
                pabbr1, since1, until1 = _committee_piece(
                    pieces[1], abbr, term_since, term_until)
                since1 = max(since0, since1)
                until1 = min(until0, until1)
                if since0 == since1 and until0 == until1:
                    group_entries.append({'group': pabbr1,
                                          'start': since1, 'end': until1})
                    comisii_set.add(pabbr1)
                elif since0 < since1 and until0 > until1:
                    group_entries.append({'group': pabbr0,
                                          'start': since0, 'end': since1})
                    comisii_set.add(pabbr0)
                    group_entries.append({'group': pabbr1,
                                          'start': since1, 'end': until1})
                    comisii_set.add(pabbr1)
                    group_entries.append({'group': pabbr0,
                                          'start': until1, 'end': until0})
                    comisii_set.add(pabbr0)
                elif until1 < until0:
                    # NOTE(review): this branch (role starts with the
                    # membership but ends earlier) is not visible in the
                    # recovered text and was reconstructed by symmetry with
                    # the branch below -- confirm against the notebook.
                    group_entries.append({'group': pabbr1,
                                          'start': since1, 'end': until1})
                    comisii_set.add(pabbr1)
                    group_entries.append({'group': pabbr0,
                                          'start': until1, 'end': until0})
                    comisii_set.add(pabbr0)
                elif since1 > since0:
                    group_entries.append({'group': pabbr0,
                                          'start': since0, 'end': since1})
                    comisii_set.add(pabbr0)
                    group_entries.append({'group': pabbr1,
                                          'start': since1, 'end': until0})
                    comisii_set.add(pabbr1)

    # ---- activity counters -----------------------------------------------
    since, until = term_since, term_until
    activity = record.setdefault('Activity', {})
    for a in member['activitate']:
        if len(a) <= 1:
            continue
        an = a[0]
        if an in ['Membru în']:
            continue
        # Leading integer of the value, before any '(', ',' or '-'.
        av = int(a[1].split('(')[0].strip().split(',')[0].strip().split('-')[0].strip())
        activity.setdefault(an, []).append({'value': av, 'start': since, 'end': until})
        if an == 'Luari de cuvânt' and 'în ' in a[1]:
            av = int(a[1].split('(în ')[-1].strip().split('sedinte)')[0].strip())
            # NOTE(review): stored as a single dict (overwritten per record),
            # unlike the list used for the other counters -- kept as found.
            activity['Sedințe'] = {'value': av, 'start': since, 'end': until}

with open('data/nmembers.json', 'w') as f:
    f.write(json.dumps(nmembers))

with open('data/deaths.json', 'w') as f:
    f.write(json.dumps(deaths))


# No need to re-run

def party_cleaner(i, p):
    """Return party name `p` without its leading abbreviation `i`.

    When `p` equals `i` it is returned unchanged; otherwise the first
    len(i)+1 characters are dropped and the text before the next '-' is kept.
    """
    if i == p:
        return p
    p = p[len(i) + 1:].split('-')[0].strip()
    return p
# Run once: align the party image table with the parties actually seen in the
# cleaned data, normalise the state image keys, apply manual name fixes and
# write everything to data/img.json.

# Parties that occur in the data but have no scraped image get a stub entry ...
for abbr in party_set.difference(set(party_imgs.keys())):
    party_imgs[abbr] = {'name': abbr, 'img': ''}
# ... and scraped entries that never occur in the data are dropped.
for abbr in set(party_imgs.keys()).difference(party_set):
    party_imgs.pop(abbr)
for abbr in party_imgs:
    party_imgs[abbr]['name'] = party_cleaner(abbr, party_imgs[abbr]['name'])

# Re-key the state images by their cleaned abbreviation; '' is the fallback.
nstate_imgs = {}
for key in state_imgs:
    nstate_imgs[state_format(key)] = state_imgs[key]
nstate_imgs[''] = ''

#img=json.loads(open('data/img.json','r').read())
img = {'party': party_imgs, 'state': nstate_imgs}

# manual name fixes (raise KeyError if the party is absent from the table)
img['party']['PP-DD']['name'] = 'Partidul Poporului Dan Diaconescu'
img['party']['PUR-SL']['name'] = 'Partidul Umanist din România'
# NOTE(review): 'RMDSZ' is bound to the very same dict object as 'UDMR', so
# the rename on the next line changes the name stored under both keys --
# confirm this is intended; copy the dict first if 'UDMR' should keep its name.
img['party']['RMDSZ'] = img['party']['UDMR']
img['party']['RMDSZ']['name'] = 'Romániai Magyar Demokrata Szövetség'

# Context manager so the file is flushed and closed deterministically.
with open('data/img.json', 'w') as f:
    f.write(json.dumps(img))