This is the final data output from the SLR

there is a difference between what we are reporting and the results from the SLR

  1. We dropped 84 records, for a total of 3'898 pieces
  2. They had no reported area, so we cannot calculate density
  3. However, in terms of pieces per meter -- all beaches were included

Scroll through to see how we got here

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle
import re
import os
from scipy.stats import norm
import scipy.stats
In [2]:
# Project root (the current working directory) and its local data folder;
# later cells build all input/output paths from these two names.
here = os.getcwd()
data = f"{here}/data"
In [3]:
# Get the treated data from project manager
In [4]:
# Load the treated SLR data supplied by the project manager.
# NOTE(review): `encoding` is not an accepted keyword of pd.read_excel in
# modern pandas -- this line likely only runs on an old pandas version;
# confirm the pinned version before re-running.
a = pd.read_excel(data + '/slr_final.xlsx', sheet_name='SLR_Data', header=0, encoding='latin1')
In [5]:
# Get the existing location information
# this includes city, post code, river or lake etc...
# retrieved from current db for app
In [6]:
# Beach/location reference data (no header row; columns are unnamed 0..7).
# NOTE(review): hard-coded absolute path into a user's home directory --
# this breaks on any other machine; consider moving the file under ./data.
bch = pd.read_csv('/home/mw-shovel/Downloads/slr_beaches.csv', header=None)
In [7]:
bch[:1]
Out[7]:
0 1 2 3 4 5 6 7
0 aarezufluss_bern_scheurerk 46.97101 7.45279 Bern 3004 r SLR Aare
In [8]:
# compare the location names
# see if there were any added by the pm since the last upload
In [9]:
# Compare the location names from the PM's data against the app database.
nme_p = list(a['BeachName'].unique())
nme_h = list(bch[0].unique())
# True symmetric difference: names present in one source but not the other.
# (The original condition tested `x not in nme_p` twice, so names missing
# from nme_h could never be reported.)
dif = [x for x in nme_p + nme_h if x not in nme_p or x not in nme_h]
dif
Out[9]:
[]
In [10]:
# okay this is empty
# just to make it easier to understand what I did 
In [11]:
# just to make sure:
def same_names(a, b):
    """Print every name in `a` that does not appear in `b`.

    Used as a sanity check after the list-comprehension diff above;
    output order follows `a`. Returns None.
    """
    b_set = set(b)  # O(1) membership instead of an O(n) list scan per name
    for c in a:
        if c not in b_set:
            print(c)
same_names(nme_h, nme_p)
In [12]:
# okay thats all good
In [13]:
# assign appropriate column names to the locations data table
In [14]:
# Name the first six positional columns; columns 6 and 7 keep integer labels.
# NOTE(review): per the preview above, column 5 holds 'r' (river/lake flag)
# and column 6 holds 'SLR' (the project tag) -- naming column 5 'project'
# may be off by one; confirm against the source CSV.
bch.rename(columns={0:'Location', 1:'latitude', 2:'longitude', 3:'city', 4:'post', 5:'project'}, inplace=True)
In [15]:
# okay now reduce the slr_final data to what we need
# take a look at the columns
a_cs = a.columns
In [16]:
a_cs[:20]
Out[16]:
Index(['BeachName', 'lat_y1', 'lon_x1', 'lat_y2', 'lon_x2', 'BeachLength_m',
       'EventDate', 'G3', 'G4', 'G7', 'G8', 'G10', 'G13', 'G21', 'G22', 'G23',
       'G24', 'G25', 'G26', 'G27'],
      dtype='object')
In [17]:
# Column subset to carry forward: survey metadata, one column per object
# ("G") code, and the totals. Content identical to the original literal list.
new_cols = (
    'BeachName BeachLength_m EventDate '
    'G3 G4 G7 G8 G10 G13 G21 G22 G23 '
    'G24 G25 G26 G27 G28 G30 G31 G32 G33 G34 G35 G37 G49 G50 G52 '
    'G59 G65 G66 G67 G70 G71 G79 G80 G82 G83 G87 G89 G90 G91 G93 '
    'G95 G96 G99 G100 G101 G124 G125 G126 G128 G131 G133 G134 G137 G138 '
    'G142 G144 G145 G147 G151 G152 G153 G154 G155 G156 G158 G159 G160 G165 '
    'G167 G171 G172 G175 G176 G177 G178 G179 G182 G188 G191 G194 G195 G198 '
    'G199 G200 G203 G204 G208 G210 G211 G213 '
    'TOTAL Area DENSITY'
).split()
In [18]:
a_1 = a[new_cols].copy()
In [19]:
obs_v = a[['BeachName','Percent_WashedAshore']].copy()
In [20]:
# Need to identify all records where area = 0
# This will give a 'undefined' value for 'DENSITY'
In [21]:
# Replace all NaNs with 0 across the survey table. In particular a missing
# Area becomes 0, which is exactly what the `Area == 0` filter below uses
# to find the records that cannot have a density.
a_1.fillna(0, inplace=True)
In [22]:
# Records with no usable area (84 records / 3'898 pieces -- see the counts
# at the bottom of the notebook, where this CSV is read back in).
no_area = a_1.loc[a_1.Area == 0].copy()
no_area.to_csv('data/no_area.csv')
In [23]:
# Align column names with what the app database expects.
a_1.rename(columns={'BeachName':'location', 'BeachLength_m':'length', 'EventDate':'date'}, inplace=True)
In [24]:
# okay now we need to drop the records that have an area = 0
# notice I keep the original dataframe and make a new one
In [25]:
# Surveys with a measured area -- the only ones where pieces-per-area
# density can be computed. a_1 (all surveys) is kept for the pcs/m tables.
a_2 = a_1.loc[a_1.Area > 0].copy()
In [26]:
# Object ("G") codes used as the per-item count columns.
# Content identical to the original literal list.
code_list = (
    'G3 G4 G7 G8 G10 G13 G21 G22 G23 '
    'G24 G25 G26 G27 G28 G30 G31 G32 G33 G34 G35 G37 G49 G50 G52 '
    'G59 G65 G66 G67 G70 G71 G79 G80 G82 G83 G87 G89 G90 G91 G93 '
    'G95 G96 G99 G100 G101 G124 G125 G126 G128 G131 G133 G134 G137 G138 '
    'G142 G144 G145 G147 G151 G152 G153 G154 G155 G156 G158 G159 G160 G165 '
    'G167 G171 G172 G175 G176 G177 G178 G179 G182 G188 G191 G194 G195 G198 '
    'G199 G200 G203 G204 G208 G210 G211 G213'
).split()
In [27]:
def make_item_counts_pcm(df=None, codes=None):
    """Flatten survey rows into one record per (survey, object-code).

    Parameters
    ----------
    df : DataFrame, optional
        Survey data with 'location', 'date' (Timestamp), 'length' and one
        column per object code. Defaults to the module-level `a_1`
        (all surveys, pieces-per-meter basis).
    codes : list of str, optional
        Object-code columns to scan; defaults to module-level `code_list`.

    Returns
    -------
    list of dict
        Keys: location, date (YYYY-MM-DD string), length, quantity,
        density (quantity / length, i.e. pieces per meter), code.
        Zero counts are skipped.
    """
    df = a_1 if df is None else df
    codes = code_list if codes is None else codes
    records = []
    for _, row in df.iterrows():
        for code in codes:
            if row[code] > 0:
                records.append({
                    'location': row['location'],
                    'date': row['date'].strftime("%Y-%m-%d"),
                    'length': row['length'],
                    'quantity': row[code],
                    # assumes length > 0 for every survey -- TODO confirm upstream
                    'density': row[code] / row['length'],
                    'code': code,
                })
    return records
a_per_item_pcm = make_item_counts_pcm()
In [28]:
slr_data = pd.DataFrame(a_per_item_pcm)
In [29]:
slr_data = slr_data[['location','date','length','quantity','density','code']]
In [30]:
slr_data['density'] = slr_data['density'].round(4)
slr_data['quantity'] = slr_data['quantity'].astype(int)
In [31]:
slr_data.set_index('location', inplace=True)
In [32]:
slr_data.to_csv('data/slr_data.csv')
In [33]:
def make_item_counts_area(df=None, codes=None):
    """Flatten survey rows into one record per (survey, object-code),
    on a pieces-per-area basis.

    Parameters
    ----------
    df : DataFrame, optional
        Survey data with 'location', 'date' (Timestamp), 'Area' and one
        column per object code. Defaults to the module-level `a_2`
        (surveys with a measured, non-zero area).
    codes : list of str, optional
        Object-code columns to scan; defaults to module-level `code_list`.

    Returns
    -------
    list of dict
        Keys: location, date (YYYY-MM-DD string), area, quantity,
        density (quantity / Area), code. Zero counts are skipped.
    """
    df = a_2 if df is None else df
    codes = code_list if codes is None else codes
    records = []
    for _, row in df.iterrows():
        for code in codes:
            if row[code] > 0:
                records.append({
                    'location': row['location'],
                    'date': row['date'].strftime("%Y-%m-%d"),
                    'area': row['Area'],
                    'quantity': row[code],
                    # Area > 0 is guaranteed by the a_2 filter upstream
                    'density': row[code] / row['Area'],
                    'code': code,
                })
    return records
a_per_item_area = make_item_counts_area()
In [34]:
# get the total of all the items found
# then get the daily density of all items per location per date
In [35]:
# Row totals over all object-code columns.
a_2['total']=a_2[code_list].sum(axis=1)
a_1['total']=a_1[code_list].sum(axis=1)
# Keep only surveys where at least one item was found (pcs/m table).
# NOTE(review): a_2 is not filtered the same way -- confirm that is intended.
a_1 = a_1[a_1.total > 0].copy()
In [36]:
# the remote database has density as the column name for pcs/m
# will keep as is and name the total/area column as density2
# saves a lot of work, and that can be changed later
a_1['density'] = a_1['total']/a_1['length']    # pieces per meter of shoreline
a_2['density2'] = a_2['total']/a_2['Area']     # pieces per unit area
In [37]:
# Per-survey summary tables: pcs/m over all surveys, and pcs/area
# over the surveys that have a measured area.
slr_dens_date = a_1[['location', 'date', 'density', 'total']].copy()
slr_dens_date2 = a_2[['location', 'date', 'density2', 'total']].copy()
In [38]:
def change_name(df):
    # Intended to rename 'total' -> 'quantity' for the export tables.
    # NOTE(review): DataFrame.rename returns a NEW frame unless inplace=True;
    # the result is discarded here, so this function is a no-op as written.
    # Downstream code (get_sample_no) still reads row['total'], i.e. it now
    # depends on the rename NOT happening -- if this is ever fixed, both
    # places must be changed together.
    df.rename(columns={'total':'quantity'})
change_name(slr_dens_date)
In [39]:
change_name(slr_dens_date2)
In [40]:
def get_sample_no(df, names=None):
    """Number each location's surveys chronologically (1 = earliest).

    Parameters
    ----------
    df : DataFrame
        Must have columns 'location', 'date', 'density2' and 'total'
        (built for slr_dens_date2, the pieces-per-area table).
    names : iterable of str, optional
        Locations to process; defaults to the module-level `nme_p`.

    Returns
    -------
    list of dict
        Keys: location, date (still a Timestamp -- formatted by the caller),
        sample (1-based chronological index), density2, quantity.
    """
    names = nme_p if names is None else names
    out = []
    for name in names:
        subset = df.loc[df.location == name].sort_values(by='date').reset_index()
        for i, row in subset.iterrows():
            out.append({'location': row['location'],
                        'date': row['date'],
                        'sample': i + 1,
                        'density2': row['density2'],
                        'quantity': row['total']})
    return out
slr_d_d2 = get_sample_no(slr_dens_date2)           
In [41]:
slr_area_samps = pd.DataFrame(slr_d_d2)
In [42]:
slr_area_samps = slr_area_samps[['location', 'date', 'sample', 'density2', 'quantity']]
In [43]:
slr_area_samps['date'] = slr_area_samps['date'].dt.strftime("%Y-%m-%d")
In [44]:
slr_area_samps.set_index('location', inplace=True)
In [45]:
slr_area_samps = slr_area_samps[['date', 'sample', 'density2', 'quantity']]
In [46]:
slr_area_samps['density2'] = slr_area_samps['density2'].round(3)
slr_area_samps['quantity'] = slr_area_samps['quantity'].astype(int)
slr_area_samps['sample'] = slr_area_samps['sample'].astype(int)
In [47]:
slr_area_samps.to_csv('data/slr_area.csv')
In [48]:
def get_sample_no(df, names=None):
    """Number each location's surveys chronologically (1 = earliest).

    NOTE(review): this intentionally shadows the earlier get_sample_no;
    the only difference is that it reads 'density' (pcs/m) instead of
    'density2' (pcs/area). Consider one shared function with a column
    parameter.

    Parameters
    ----------
    df : DataFrame
        Must have columns 'location', 'date', 'density' and 'total'
        (built for slr_dens_date, the pieces-per-meter table).
    names : iterable of str, optional
        Locations to process; defaults to the module-level `nme_p`.

    Returns
    -------
    list of dict
        Keys: location, date (still a Timestamp -- formatted by the caller),
        sample (1-based chronological index), density, quantity.
    """
    names = nme_p if names is None else names
    out = []
    for name in names:
        subset = df.loc[df.location == name].sort_values(by='date').reset_index()
        for i, row in subset.iterrows():
            out.append({'location': row['location'],
                        'date': row['date'],
                        'sample': i + 1,
                        'density': row['density'],
                        'quantity': row['total']})
    return out
In [49]:
slr_pcs_m = get_sample_no(slr_dens_date)
In [50]:
# Assemble the pieces-per-meter sample table and coerce types for export.
slr_p_m = pd.DataFrame(slr_pcs_m)
slr_p_m = slr_p_m[['location', 'date', 'sample', 'density', 'quantity']]
# dates are still Timestamps at this point -- format them for the CSV
slr_p_m['date'] = slr_p_m['date'].dt.strftime("%Y-%m-%d")
# slr_p_m.set_index('date', inplace=True)
slr_p_m['density'] = slr_p_m['density'].round(4)
slr_p_m['quantity'] = slr_p_m['quantity'].astype(int)
slr_p_m['sample'] = slr_p_m['sample'].astype(int)
In [51]:
slr_p_m.set_index('location', inplace=True)
In [52]:
slr_p_m.to_csv('data/slr_pieces.csv')
In [53]:
# Read back the dropped (no-area) records to report their counts.
# (variable name 'droped' [sic] kept -- it is referenced by the cells below)
droped = pd.read_csv('data/no_area.csv')
In [54]:
len(droped)
Out[54]:
84
In [55]:
droped.TOTAL.sum()
Out[55]:
3898
In [56]:
# presumably 95'971 is the grand total of pieces across all surveys --
# TODO confirm; subtracting the 3'898 pieces in the dropped records gives
# the piece count available for the density (pcs/area) tables.
95971 - 3898
Out[56]:
92073