In [79]:
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
In [2]:
df1=pd.read_csv('Dataset1_Syria Archive.csv',header=None,names=['URL','title','summary','date','time','location','weapon','collection','violation'])
df2 = pd.read_csv('Dataset2_Violations Documentation Center in Syria.csv', header=None, names=['name', 'status', 'sex', 'province', 'area','date', 'cause of death','actors'])
In [46]:
df1[:2]
Out[46]:
URL title summary date time location weapon collection violation location_clean
0 https://syrianarchive.org/en/database?after=20... Russian aircrafts commit a massacre in Kafar H... Russian aircrafts commit a massacre in Kafar H... 2016-02-27 3:16:24 ALEPPO : Kafr hamrah NaN Civilian casualties as a result of alleged rus... Unlawful attacks ALEPPO
1 https://syrianarchive.org/en/database?after=20... #Witness: 8 killed by Russian airstrikes on th... #Witness: 8 killed by Russian airstrikes on th... 2016-02-27 6:03:25 ALEPPO : Kafr hamrah NaN Civilian casualties as a result of alleged rus... Unlawful attacks ALEPPO
In [47]:
df2[:2]
Out[47]:
name status sex province area date cause of death actors
0 Abdulla Riyad Hammash Non-Civilian Adult - Male Idlib Jisr Shagour: Najieh 2016-02-26 Shooting NaN
1 Muhannad Abo Wiliam Non-Civilian Adult - Male Damascus Suburbs Daraya 2016-02-26 Shooting NaN

Part 1 Data Analysis

Clean and analyse locations in the first dataset from Syrian Archive

In [45]:
def clean_location(r):
    return str(r).split(':')[0]
df1['location_clean']=df1['location'].apply(clean_location)
df1['location_clean'].apply(str).value_counts()
Out[45]:
ALEPPO       1920
IDLIB         219
HAMA          103
DAMASCUS       97
nan            41
HOMS           39
DARAA           9
LATTAKIA        2
حلب             1
Name: location_clean, dtype: int64
In [125]:
#replace "حلب" with "missing value"
df_media_location=pd.DataFrame({'location':['Aleppo','Idlib','Hama','Damascus','NaN','Homs','Daraa','Lattakia','missing value'],'number':[1920,219,103,97,41,39,9,2,1]})
#calculate the distribution of locations
total=0
percentage=[]
for i in df_media_location['number']:
    total=total+i
for i in df_media_location['number']:
    percentage.append(i/total)
df_media_location['percentage']=pd.Series(percentage)

Clean and analyse death causes in the first dataset from Syrian Archive

In [11]:
death_cause=df1['collection'].value_counts()
death_cause
Out[11]:
Civilian casualties as a result of alleged russian attacks                               230
Chemical weapons                                                                         139
Attacks against hospitals                                                                 92
Chemical Weapons                                                                          11
Attacks against schools                                                                    9
Attacks against hospitals, Chemical weapons                                                8
Attacks against humanitarian relief personnel and objects                                  5
Civilian casualties as a result of alleged russian attacks, Attacks against hospitals      3
Attacks against hospitals, Attacks against humanitarian relief personnel and objects       2
Attacks against bakeries                                                                   1
Attacks against journalists                                                                1
Name: collection, dtype: int64
In [35]:
death_cause_media=pd.DataFrame({'cause':['Civilian casualties as a result of alleged russian attacks',
                                         'Chemical weapons','Attacks against hospitals',
                                            'Attacks against schools',
                                            'Attacks against humanitarian relief personnel and objects',
                                            "Attacks against bakeries",
                                            "Attacks against journalists"],
                                'number':[233,158,105,9,7,1,1]})
#calculate the distribution of locations
total=0
percentage=[]
for i in death_cause_media['number']:
    total=total+i
for i in death_cause_media['number']:
    percentage.append(i/total)
death_cause_media['percentage']=pd.Series(percentage)

Clean and analyse locations in the second dataset from Violation Documentation Center in Syria

In [82]:
df_fact_location=df2['province'].value_counts().to_frame()
df_fact_location=df_fact_location.reset_index()
df_fact_location.columns=['location','number']
total=0
percentage=[]
for i in df_fact_location['number']:
    total=total+i
for i in df_fact_location['number']:
    percentage.append(i/total)
df_fact_location['percentage']=pd.Series(percentage)
df_fact_location
Out[82]:
location number percentage
0 Aleppo 7990 0.260660
1 Damascus Suburbs 6372 0.207875
2 Idlib 4434 0.144651
3 Deir Ezzor 2904 0.094738
4 Homs 2304 0.075164
5 Daraa 2231 0.072782
6 Hama 1856 0.060549
7 Raqqa 1327 0.043291
8 Damascus 601 0.019607
9 Other Nationalities 139 0.004535
10 Quneitra 121 0.003947
11 Hasakeh 105 0.003425
12 Lattakia 103 0.003360
13 Unknown 83 0.002708
14 Tartous 48 0.001566
15 Sweida 35 0.001142
In [83]:
#As shown above, "Damascus Suburbs" can be merged into "Damascus".
df_fact_location['number'][8]=df_fact_location['number'][8]+df_fact_location['number'][1]
df_fact_location['percentage'][8]=df_fact_location['percentage'][8]+df_fact_location['percentage'][1]
df_fact_location=df_fact_location.drop(1)
/Users/p17417864/Desktop/venv/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/Users/p17417864/Desktop/venv/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until

Clean and analyse death causes in the second dataset from Violation Documentation Center in Syria

In [33]:
df2['cause of death'].value_counts()
Out[33]:
Warplane shelling                   12926
Shooting                             8348
Shelling                             4789
Explosion                            2353
Field Execution                       487
Detention - Torture                   465
Kidnapping - Execution                387
Unknown                               257
Chemical and toxic gases              226
Other                                 120
Detention - Execution                 117
Siege                                  93
Un-allowed to seek Medical help        39
Kidnapping - Torture                   31
Kidnapping - Torture - Execution        9
Detention - Torture - Execution         6
Name: cause of death, dtype: int64

Merge "Detention - Torture", "Kidnapping - Execution", "Detention - Execution", "Kidnapping - Torture", "Kidnapping - Torture - Execution" ,"Detention - Torture - Execution" into "mixed"

In [122]:
death_cause_fact=pd.DataFrame({'cause':['Warplane shelling', 
                                        'Shooting', 
                                        'Shelling', 
                                        'Explosion',
                                        'Field Execution', 
                                        'Unknown', 
                                        'Chemical and toxic gases', 
                                        'Other', 
                                        'Siege',
                                        'Un-allowed to seek Medical help',
                                       'mixed'],
                                'number':[12926,8348,4789,2353,487,257,226,120,93,39,976]})
In [123]:
total=0
percentage=[]
for i in death_cause_fact['number']:
    total=total+i
for i in death_cause_fact['number']:
    percentage.append(i/total)
death_cause_fact['percentage']=pd.Series(percentage)

Analyse casualties of civilian and non-civilian caused by different actors

In [102]:
emmm=df2['actors'].value_counts()
emmm
Out[102]:
Syrian government and affiliated militias                              15580
Russian troops                                                          3808
The organization of the Islamic State in Iraq and the Levant - ISIS     3102
Not identified                                                          2696
International coalition forces                                          2059
Armed opposition groups                                                 1634
Self administration forces                                               582
Al-Nusra Front                                                           112
Name: actors, dtype: int64

Apart from "Not identified", we chose to casualties of civilian and non-civilian caused by top 5 actors, namely "Syrian government and affiliated militias", "Russian troops", "ISIS", "International coalition forces", "Armed opposition groups".

In [91]:
df2[df2['actors']=='Syrian government and affiliated militias']['status'].value_counts()
Out[91]:
Civilian        10324
Non-Civilian     5256
Name: status, dtype: int64
In [92]:
df2[df2['actors']=='Russian troops']['status'].value_counts()
Out[92]:
Civilian        3651
Non-Civilian     157
Name: status, dtype: int64
In [93]:
df2[df2['actors']=='The organization of the Islamic State in Iraq and the Levant - ISIS']['status'].value_counts()
Out[93]:
Civilian        1950
Non-Civilian    1152
Name: status, dtype: int64
In [94]:
df2[df2['actors']=='International coalition forces']['status'].value_counts()
Out[94]:
Civilian        1905
Non-Civilian     154
Name: status, dtype: int64
In [95]:
df2[df2['actors']=='Armed opposition groups']['status'].value_counts()
Out[95]:
Civilian        821
Non-Civilian    813
Name: status, dtype: int64
In [100]:
actors_status=pd.DataFrame({'actors':['Syrian government and affiliated militias', 
                                        'Syrian government and affiliated militias', 
                                        'Russian troops', 
                                        'Russian troops',
                                        'ISIS', 
                                        'ISIS', 
                                        'International coalition forces', 
                                        'International coalition forces', 
                                        'Armed opposition groups',
                                       'Armed opposition groups'],
                               'status':['Civilian','Non-Civilian',
                                        'Civilian','Non-Civilian',
                                        'Civilian','Non-Civilian',
                                        'Civilian','Non-Civilian',
                                        'Civilian','Non-Civilian'],
                                'number':[10324,5256,3651,157,1950,1152,1905,154,821,813]})
In [105]:
percentage=[]
for i in range(5):
    percentage.append(actors_status['number'][2*i]/emmm.values[i])
    percentage.append(actors_status['number'][2*i+1]/emmm.values[i])
actors_status['percentage']=pd.Series(percentage)

Part 2 Data Visualisation

Comparing location distribution in two datasets

In [ ]:
df_fact_location=df_fact_location.sort_values(by='percentage',ascending=True)[7:15]
In [126]:
fig=plt.figure(figsize=(7,3.5))
plt.barh(df_fact_location['location'].values,df_fact_location['percentage'].apply(float).values,
        color=['#cccccc','#cccccc','#cccccc','#cccccc','#b30000','#b30000','#b30000','#b30000'])
plt.title('location distribution in the second dataset',fontsize=20)
plt.xlim(range(2))
plt.savefig('location_fact.png')
Out[126]:
(0, 1)
In [127]:
fig=plt.figure(figsize=(7,3.5))
plt.barh(df_fact_location['location'].values,[0,0.042369,0.003702,0.016043,0,0.090086,0.039901,0.789798],
         color=['#cccccc','#b30000','#cccccc','#cccccc','#cccccc','#b30000','#b30000','#b30000'])
plt.title('location distribution in the first dataset',fontsize=20)
plt.xlim(range(2))
plt.savefig('location_media.png')
Out[127]:
(0, 1)

Use photoshop to refine the two graphs, then we get:

Compare and Visualize death causes in two datasets

In [124]:
death_cause_fact
Out[124]:
cause number percentage
0 Warplane shelling 12926 0.422225
1 Shooting 8348 0.272686
2 Shelling 4789 0.156432
3 Explosion 2353 0.076860
4 Field Execution 487 0.015908
5 Unknown 257 0.008395
6 Chemical and toxic gases 226 0.007382
7 Other 120 0.003920
8 Siege 93 0.003038
9 Un-allowed to seek Medical help 39 0.001274
10 mixed 976 0.031881
In [114]:
death_cause_media
Out[114]:
cause number percentage
0 Civilian casualties as a result of alleged rus... 233 0.453307
1 Chemical weapons 158 0.307393
2 Attacks against hospitals 105 0.204280
3 Attacks against schools 9 0.017510
4 Attacks against humanitarian relief personnel ... 7 0.013619
5 Attacks against bakeries 1 0.001946
6 Attacks against journalists 1 0.001946

Use R Language and Photopshop to visualize the results:

Visualize casualties of civilian and non-civilian caused by different actors

In [106]:
actors_status
Out[106]:
actors number status percentage
0 Syrian government and affiliated militias 10324 Civilian 0.662644
1 Syrian government and affiliated militias 5256 Non-Civilian 0.337356
2 Russian troops 3651 Civilian 0.958771
3 Russian troops 157 Non-Civilian 0.041229
4 ISIS 1950 Civilian 0.628627
5 ISIS 1152 Non-Civilian 0.371373
6 International coalition forces 1905 Civilian 0.706602
7 International coalition forces 154 Non-Civilian 0.057122
8 Armed opposition groups 821 Civilian 0.398737
9 Armed opposition groups 813 Non-Civilian 0.394852
In [128]:
fig=plt.figure(figsize=(7,5))
d=np.arange(5)
y1=actors_status['percentage'][d*2]
plt.bar(d,y1,label='Civilian',color='#b30000')
plt.bar(d,actors_status['percentage'][d*2+1],bottom=y1,label='Non Civilian',color='#b3b3b3')
plt.xticks(range(6),actors_status['actors'][d*2])
plt.savefig('russia_citizen.png')
Out[128]:
([<matplotlib.axis.XTick at 0x1106ee0b8>,
  <matplotlib.axis.XTick at 0x11061b9b0>,
  <matplotlib.axis.XTick at 0x11061b710>,
  <matplotlib.axis.XTick at 0x110791710>,
  <matplotlib.axis.XTick at 0x110791be0>,
  <matplotlib.axis.XTick at 0x11061b438>],
 <a list of 5 Text xticklabel objects>)

Use photoshop to refine the graph, then we get:

Visualize the location distribution of casualties caused by Russian troops

In [112]:
df2[df2['actors']=='Russian troops']['province'].value_counts()
Out[112]:
Aleppo              1554
Idlib                754
Deir Ezzor           639
Hama                 343
Damascus Suburbs     258
Homs                 129
Raqqa                 91
Daraa                 30
Damascus               8
Lattakia               2
Name: province, dtype: int64

Use photoshop to visualize the results, then we get:

Other aspects mentioned in the story

In [110]:
df1[df1['collection']=='Civilian casualties as a result of alleged russian attacks']['violation'].value_counts()
Out[110]:
Unlawful attacks          220
Use of illegal weapons     10
Name: violation, dtype: int64
In [ ]:
 
In [ ]:
 
In [ ]: