#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd get_ipython().run_line_magic('matplotlib', 'inline') # # What are items are generating the most OOPE? # # * The most expensive single presentation is Cinacalcet HCl_Tab 30mg, costing £16,000 per month in OOPE # * Calcium and Magnesium supplements overall cost £25,000 per month in OOPE with a mean OOPE of £3.30 # * The most expensive single OOPE is for Tretinoin Cap 10mg with OOPE cost of £80.99 # * The most expensive commonly-prescribed presentation is Solgar_Mag Cit Tab 200mg with mean OOPE per item of £9.86 # * At a BNF paragraph level, the most expensive things are: # * Vitamin D (£67,000 per month) # * Foods for Special Diets (£52,000 per month) # * Enteral Nutrition (£43,000 per month) # A [recent case of a pharmacist being suspended](https://www.pharmacyregulation.org/sites/default/files/determinations/tasleem_fatma_2063889_determination.pdf) gives some background to how OOPE overcharging might happen. In this case (2013)it was carelessness and inexperience: # # * It affected Aciferol D3, Amlostin, Glutafin G/F bread, Mepradec capsules, ProD3 4, Simvador tablets # * These were branded generics, prescribed as brand by GP on grounds they were cheaper (10%) that DT price # * But there were not available from the usual wholesaler # * They were therefore put through as "specials" to their alternative wholesaler (Pharmacare) # * Each order to Pharmacare incurs a £50 plus VAT handling fee and a £15 plus VAT administration fee # # It was confirmed [on Twitter](https://twitter.com/CPPEOli/status/1082224683167244289) that the £36.98 amount mentioned below is the standard "special obtains" fee charged by Alliance; which they will always charge for items they don't usually hold. 
#

# In[2]:

# One month (March 2017) of dispensing, aggregated per presentation (BNF code),
# restricted to rows where any out-of-pocket expense (OOPE) was claimed.
sql = """
SELECT
  SUM(item_count) AS items,
  SUM(item_pay_oope_amt) AS oope,
  bnf_code,
  bnf_name
FROM `ebmdatalab.dispensers.dispensing_with_metadata`
WHERE year_month = '201703'
AND item_pay_oope_amt > 0
GROUP BY bnf_code, bnf_name
ORDER BY SUM(item_pay_oope_amt) DESC
LIMIT 1000
"""
items = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard')

# In[3]:

# Mean OOPE per prescribed item, per presentation.
items['oope_per_item'] = items['oope'] / items['items']
items.head(3)

# In[4]:

# Most expensive per item, among commonly-prescribed presentations only (>100 items/month).
items[items['items'] > 100].sort_values('oope_per_item', ascending=False).head(3)

# In[5]:

# Grouped at BNF paragraph level (the first six characters of the BNF code).
items['bnf_para'] = items['bnf_code'].str.slice(0, 6)
items.groupby('bnf_para').agg(['mean', 'sum']).sort_values(('oope', 'sum'), ascending=False).head(10)

# # How does it change over time?
#
# In the period for which we have data, OOPE has decreased overall. This is because the number of items having OOPE added is decreasing; however, the amount of OOPE being added per item is increasing. See charts below.

# In[2]:

# Monthly totals across all dispensers: total OOPE, items carrying OOPE, and all items.
sql = """
SELECT
  PARSE_DATE("%Y%m_%d", CONCAT(year_month, "_01")) AS date,
  SUM(item_pay_oope_amt) AS oope,
  SUM(CASE WHEN item_pay_oope_amt > 0 THEN item_count ELSE 0 END) AS oope_items,
  SUM(item_count) AS items
FROM `ebmdatalab.dispensers.dispensing_with_metadata`
GROUP BY year_month
ORDER BY year_month
"""
dftime = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard')

# In[3]:

dftime['oope_per_item'] = dftime['oope'] / dftime['oope_items']
dftime.set_index('date')[['oope']].plot()
dftime.set_index('date')[['oope_per_item']].plot()
dftime.set_index('date')[['oope_items']].plot()

# In[8]:

# Look up by CCG
# Add a column in bigquery that matches postcode to CCG
sql = """
SELECT
  PARSE_DATE("%Y%m_%d", CONCAT(year_month, "_01")) AS date,
  SUM(item_pay_oope_amt) AS oope,
  SUM(CASE WHEN item_pay_oope_amt > 0 THEN item_count ELSE 0 END) AS oope_items,
  SUM(item_count) AS items
FROM `ebmdatalab.dispensers.dispensing_with_metadata`
WHERE ccg = '08H'
GROUP BY year_month
ORDER BY year_month
"""
dftime = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard')

# In[7]:

dftime['oope_per_item'] = dftime['oope'] / dftime['oope_items']
dftime.set_index('date')[['oope']].plot()
dftime.set_index('date')[['oope_per_item']].plot()
dftime.set_index('date')[['oope_items']].plot()

# ## Is there interesting variation for multiples?

# In[17]:

# summarise OOPE spending per item, grouped by HQ
sql = """
SELECT
  hq_name,
  COUNT(DISTINCT organisation_code) AS branches,
  SUM(item_count) AS items,
  SUM(actual_cost) AS actual_cost,
  SUM(CASE WHEN item_pay_oope_amt > 0 THEN item_count ELSE 0 END) AS oope_items,
  SUM(item_pay_oope_amt) AS oope,
  MAX(item_pay_oope_amt/item_count) AS max_oope_per_item
FROM `ebmdatalab.dispensers.dispensing_with_metadata`
WHERE year_month = '201703'
GROUP BY hq_name
"""
df = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard')

# In[12]:

print("In a single month, there were {} branches, represented by {} HQs, prescribing {} items".format(
    df['branches'].sum(), df['branches'].count(), df['items'].sum()))
print("{} items had OOPE added".format(df['oope_items'].sum()))
# BUG FIX: the original line mixed %-interpolation with a never-called
# str.format placeholder ("£%s ({})" % ...), so it printed a literal "{}";
# it also rounded each row before summing. Round the monthly total instead.
print("Total OOPE per month £{}".format(round(df['oope'].sum())))

# ## HQs with highest OOPE per item
#
# If we only count HQs with at least 100 OOPE items per month, per branch, then the top-spending (per OOPE item) HQ charges an average of £57 per OOPE item, and 6% of its items have OOPE added. It charges £10,869 of OOPE in one month.
#
# Of the top 10 HQs, only one was a multiple (i.e. associated with more than one contractor/location).
#

# In[23]:

# Derived per-HQ measures; column and variable names are kept stable because
# the cells below (and later chunks) read them.
df = df.assign(
    oope_per_item=df['oope'] / df['items'],
    oope_per_oope_item=df['oope'] / df['oope_items'],
    percent_oope_items=round(df['oope_items'] / df['items'] * 100),
    percent_oope_cost=round(df['oope'] / df['actual_cost'] * 100),
).sort_values('oope_per_oope_item', ascending=False)
# HQs with at least 100 OOPE items per month per branch.
common_oopers = df[df['oope_items'] / df['branches'] > 100]

# In[24]:

common_oopers.head(13)

# In[30]:

common_oopers.sort_values('percent_oope_cost', ascending=False).head(5)

# ## Distribution of OOPE per item by HQ
# The majority of HQs have no OOPE at all. All but six have their mean OOPE across all items of under 50p; 87% had a mean OOPE of less than 1p. In the first chart, note the log scale!

# In[ ]:

zero_oope_hqs = df[df['oope'] == 0].count().iloc[0]
all_hqs = df.count().iloc[0]
print("{}% of contracts never charge any OOPE".format(round(zero_oope_hqs / all_hqs * 100)))

# In[207]:

# Coarse histogram table of mean OOPE per OOPE item, in 209 equal-width bins.
bins = pd.cut(df.oope_per_oope_item, 209)
df.groupby(bins)['oope_per_oope_item'].agg(['count']).head()

# In[96]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
df.hist(ax=ax, column=['oope_per_item'], bins=50)
ax.set_yscale('log')
ax.set_ylabel('HQ count (log scale)')
ax.set_xlabel('Mean OOPE per item (£)')

# In[97]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
df.hist(ax=ax, column=['oope_per_oope_item'], bins=50)
ax.set_ylabel('HQ count')
ax.set_xlabel('Mean OOPE per item that has OOPE (£)')

# ## How does high OOPE covary with group size?
#
# None of the large multiples has high mean OOPEs:

# In[98]:

df.plot.scatter(x='branches', y='oope_per_oope_item')

# # Is there anything different about the prescriptions being dispensed?
#
# That is, do the HQs with the highest mean OOPE prescribe different amounts or kinds of things from those with the lowest?
# In[41]:

print("The 10 HQs with highest mean OOPE account for %s items" % df['items'].head(10).sum())
print("The 10 HQs with lowest mean OOPE account for %s items" % df['items'].tail(10).sum())

# ## Investigate lowest and highest HQs

# In[115]:

# Raw prescribing rows for the ten HQs with the highest mean OOPE.
# NOTE(review): HQ names are spliced into the SQL with %-quoting; a name
# containing an apostrophe would break the query. Acceptable for this
# trusted internal dataset, but not a pattern for untrusted input.
sql = """
SELECT
  item_pay_dr_nic,
  item_count,
  item_pay_oope_amt,
  bnf_code,
  bnf_name,
  name,
  postcode
FROM `ebmdatalab.dispensers.dispensing_with_metadata`
WHERE year_month = '201703'
AND (%s)
"""
clauses = ["hq_name = '%s'" % name for name in common_oopers.hq_name.head(10)]
highest = pd.io.gbq.read_gbq(sql % " OR ".join(clauses), 'ebmdatalab', dialect='standard')

# In[106]:

# The same columns for the ten HQs with the lowest mean OOPE
# (df is sorted descending by oope_per_oope_item, so tail(10) is the bottom).
sql = """
SELECT
  item_pay_dr_nic,
  item_count,
  item_pay_oope_amt,
  bnf_code,
  bnf_name,
  name,
  postcode
FROM `ebmdatalab.dispensers.dispensing_with_metadata`
WHERE year_month = '201703'
AND (%s)
"""
clauses = ["hq_name = '%s'" % name for name in df.hq_name.tail(10)]
lowest = pd.io.gbq.read_gbq(sql % " OR ".join(clauses), 'ebmdatalab', dialect='standard')

# ### The 10 HQs with highest OOPE per items
# ...are accounted for by 12 dispensers, with a mean item count of 10300, mean NIC of £73,000, and mean OOPE of £5550. 3.5% of prescription items come with OOPEs (compared with 0 for the best dispensers). In a single month they charged £69,000 in OOPE, equivalent to a 9% surcharge (the highest being 22%). 67% of dispensers have NO OOPE.

# In[127]:

totals = highest.groupby("name").sum().sort_values('item_pay_oope_amt', ascending=False)
# OOPE expressed as a fraction of net ingredient cost, per dispenser.
totals['uplift'] = totals['item_pay_oope_amt'] / totals['item_pay_dr_nic']
totals

# In[130]:

totals.mean()

# ### The 10 HQs with lowest OOPE per item
# ...are accounted for by 10 dispensers, with a mean item count of 7739, mean NIC of £58,000, and mean OOPE of £0 (indeed, a maximum OOPE of £0)

# In[131]:

lowest.head(1)

# In[132]:

lowest.groupby("name").sum()

# In[133]:

lowest.groupby("name").sum().mean()

# ## Do the highest and lowest dispense very different things?
#
# Descriptive statistics for the "highest" and "lowest" groups are very similar.
#
# The mean OOPE per item (where OOPE was added at all) in the "highest" group was £36 (and 0 in the "lowest"). The "highest" group had about 120 presentations-with-OOPE not seen in the "lowest" group; for those presentations, the mean OOPE per item was £41; OOPE presentations also dispensed in the "lowest" group had a mean OOPE per item of £26.
#
# Therefore it looks like the "highest" group do routinely add more OOPE for everything; but a lot more for things only they see.

# In[186]:

import numpy as np

# Count only the items that actually carried OOPE, so we can compute
# "OOPE per item that has OOPE" rather than diluting across all items.
highest['oope_count'] = np.where(highest['item_pay_oope_amt'] > 0, highest['item_count'], 0)
df3 = highest.groupby('bnf_name').sum()
df3['oope_per_oope_item'] = df3['item_pay_oope_amt'] / df3['oope_count']
highest_presentations = df3.sort_values('oope_per_oope_item', ascending=False)
# Drop presentations with no OOPE at all (0/0 above gives NaN).
highest_presentations[~np.isnan(highest_presentations['oope_per_oope_item'])].head()

# The following shows that in the HQs with the highest OOPE per items, 3746 different presentations were dispensed, of which 286 had OOPE, with a mean OOPE per item of £36

# In[144]:

highest_presentations.describe()

# And the next table shows in the HQs with the lowest OOPE per items, 3368 items were dispensed with no OOPE

# In[155]:

# BUG FIX: the original computed `df3 = lowest.groupby('bnf_name').sum()` twice -
# once *before* 'oope_count' existed, immediately discarding the result. The
# dead first aggregation is removed; the surviving logic is unchanged.
lowest['oope_count'] = np.where(lowest['item_pay_oope_amt'] > 0, lowest['item_count'], 0)
df3 = lowest.groupby('bnf_name').sum()
df3['oope_per_oope_item'] = df3['item_pay_oope_amt'] / df3['oope_count']
lowest_presentations = df3.sort_values('oope_per_oope_item', ascending=False)
lowest_presentations.describe()

# We can combine the two tables together, to find presentations common to both high and low OOPE dispensers.
# In[159]:

# Align the two per-presentation tables on bnf_name; an outer join keeps every
# presentation seen by either group, with NaN on whichever side lacks it.
compared = highest_presentations.merge(
    lowest_presentations,
    left_index=True,
    right_index=True,
    suffixes=["_high", "_low"],
    how="outer",
)
compared.sort_values("oope_per_oope_item_high", ascending=False).head()

# In[160]:

import numpy as np

absent_from_high = np.isnan(compared['item_count_high'])
absent_from_low = np.isnan(compared['item_count_low'])
good_only = compared[absent_from_high]
bad_only = compared[absent_from_low]
both = compared[~absent_from_low & ~absent_from_high]

# There are 758 presentations only prescribed in the dispensaries with the lowest OOPE:

# In[161]:

good_only.describe()

# ...and 117 presentations-that-have-OOPE which are only prescribed in those with the highest OOPE

# In[162]:

bad_only[bad_only['item_pay_oope_amt_high'] > 0].describe()

# ...and 2498 in both, of which 192 have any OOPE:

# In[163]:

both[both['item_pay_oope_amt_high'] > 0].describe()

# In[174]:

# Total OOPE spent per item, for things prescribed in both places - top 5 (of 192 with any OOPE)
both['oope_per_oope_item_high'].sort_values(ascending=False).head()

# In[175]:

# Total OOPE spent per item, for things prescribed in only high-oope places - top 5 (of 103 with any OOPE)
bad_only['oope_per_oope_item_high'].sort_values(ascending=False).head()

# # Finally, there is a weird £36.98 thing going on
# I've noticed a lot of the things with high OOPE have identical OOPE-per item - for example, £20.99, £36.98, and £79, all examined here.
#
# These appear to be fixed OOPE prices, regardless of the item - for example, the relatively common `Bio-Vitamin D3_Cap 800u` costs 44p per pack, but £36.98 fixed OOPE.
#
# At the other end, an unusual special like `Levocarnitine_Oral Soln Paed 1.5g/5ml` costs £1571 per pack, and still £36.98 fixed OOPE.
# # # In[43]: sql = """ SELECT * FROM dispensers.dispensing_with_metadata WHERE (item_pay_oope_amt / item_count = 36.98 OR (item_pay_oope_amt / item_count) = 20.99 OR (item_pay_oope_amt / item_count) = 79.0) AND year_month = '201703' """ df = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard') # In[44]: df['oope_per_nic'] = (df['item_pay_oope_amt'] / df['item_count']) / df['item_pay_dr_nic'] # In[45]: print("A total of {} spent in OOPE for items with OOPE at these three levels".format(df['item_pay_oope_amt'].sum())) # In[42]: df[['bnf_name', 'bnf_code', 'item_pay_dr_nic', 'item_pay_oope_amt', 'oope_per_nic']].sort_values('oope_per_nic', ascending=False).head() # The most common of these: # In[38]: df.groupby('bnf_name').agg('sum')['item_count'].sort_values(ascending=False).head(10) # ## What DT categories are the OOPE items in? # Only Category C items can be claimed, but: # In[46]: sql = """ SELECT vmpp, tariff_category, d.name, d.hq_name, d.bnf_code, item_count, item_pay_oope_amt, actual_cost FROM dispensers.dispensing_with_metadata d LEFT JOIN `dmd.dt_viewer` v ON d.bnf_code = v.bnf_code WHERE year_month = '201703' AND v.date = '2017-03-01' AND item_pay_oope_amt > 0 AND tariff_category != "Part VIIIA Category C" """ df = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard') df.head() # In[47]: print("A total of {} OOPE was paid on things that shouldn't".format(df.item_pay_oope_amt.sum())) # In[48]: df.groupby('tariff_category').count() #