#!/usr/bin/env python
# coding: utf-8

# # A Cloud Price Comparison
# How do cloud providers stack up?
#
# Making apples-to-apples comparisons between different cloud providers is very difficult, because each one offers instances with varying vCPUs, RAM, SSD space and HDD space. To further obfuscate matters, slightly different billing systems, promises of arcane discounting, USD-only pricing, and inconsistent naming conventions are sprinkled throughout.
#
# As an attempt to provide a clearer price comparison, I'll be using [multiple linear regression](https://en.wikipedia.org/wiki/Linear_regression) to "[normalise](http://bit.ly/2xIUM5C)" the pricing of general purpose compute instances across different cloud providers.
#
# In essence: **if every cloud provider offered the same size compute instances, how expensive would they be?**

# ## Importing libraries

# In[1]:

import pandas as pd
import os
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import requests
import json
import re
from bs4 import BeautifulSoup

# ## The dataset
#
# I'll be taking the price tables of:
# * Google Cloud - [Predefined machine types](https://cloud.google.com/compute/pricing#predefined_machine_types)
# * AWS - [On demand instances](https://aws.amazon.com/ec2/pricing/on-demand/)
# * Azure - [Linux virtual machines](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/)
#
# and converting them into the instance sizes offered by [Catalyst Cloud](https://catalystcloud.nz/services/iaas/compute/#prices). You can find the datasets and their sources [here](https://github.com/catalyst-cloud/catalystcloud-price-comparison).

# ### Initialisation
# Ensuring the dataset directories exist for later data storage.

# In[2]:

def create_dir_if_not_exists(rel_dir_path):
    if not os.path.exists(rel_dir_path):
        os.makedirs(rel_dir_path)

create_dir_if_not_exists('dataset')
create_dir_if_not_exists('predicted-dataset')

# ### Exchange rate
# Getting the USD to NZD exchange rate.

# In[3]:

usd_to_nzd_exchange_rate_url = 'https://api.exchangeratesapi.io/latest?base=USD&symbols=NZD'
usd_to_nzd_exchange_rate_json = requests.get(usd_to_nzd_exchange_rate_url).json()
usd_to_nzd_exchange_rate = float(usd_to_nzd_exchange_rate_json['rates']['NZD'])

print(f"Current exchange rate from USD to NZD is {usd_to_nzd_exchange_rate}")

# ### Catalyst Cloud prices
# Scraping the Catalyst Cloud compute page for prices. As Catalyst Cloud currently only offers general purpose compute instances, we'll be scraping all of them.
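# Before pulling the real table, here is a minimal illustration (with made-up values, not taken from the live page) of how BeautifulSoup's `stripped_strings` flattens one table row into the list of cells used below. The column order (name, vCPU, RAM, price) is an assumption based on the current layout of the Catalyst Cloud price table.

# In[ ]:

sample_row_html = '<tr><td>example.flavour</td><td>1</td><td>0.5</td><td>$0.01</td></tr>'
sample_cells = list(BeautifulSoup(sample_row_html, 'html.parser').stripped_strings)
print(sample_cells)  # ['example.flavour', '1', '0.5', '$0.01']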
# In[4]:

# Variables
catalyst_url = 'https://catalystcloud.nz/services/iaas/compute'
catalyst_data_location = 'dataset/catalyst_price_data.csv'
catalyst_gst_exclusive = True

# In[5]:

catalyst_price_page_html = requests.get(catalyst_url).text
catalyst_price_page = BeautifulSoup(catalyst_price_page_html, 'html.parser')

catalyst_price_table = catalyst_price_page.find(attrs={'class': 'service-price-table'}).tbody
catalyst_price_rows = catalyst_price_table.find_all('tr')

catalyst_prices_list = []
for row in catalyst_price_rows:
    catalyst_price_cells = list(row.stripped_strings)
    catalyst_prices_list.append({
        'Name': catalyst_price_cells[0],
        'vCPU': float(catalyst_price_cells[1]),
        'RAM, GB': float(catalyst_price_cells[2]),
        'Price per hour, NZD (ex GST)': float(catalyst_price_cells[3].strip('$')),
        'SSD storage, GB': 0.0,
        'HDD storage, GB': 0.0
    })

# Convert to CSV
catalyst_dataframe = pd.DataFrame(catalyst_prices_list)
catalyst_dataframe.to_csv(catalyst_data_location)

print('Downloaded Catalyst prices, with {} items.'.format(catalyst_dataframe.shape[0]))

# ### AWS prices
#
# Pulling the price list from the AWS JSON API.

# In[6]:

# Variables
aws_url = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/ap-southeast-2/index.json'
aws_raw_location = 'dataset/raw_aws_bulk.json'
aws_acceptable_instance_families = [
    'General purpose',
    'Micro instances'
]
aws_data_location = 'dataset/aws_price_data.csv'
aws_gst_exclusive = True

# In[7]:

aws_bulk_json_request = requests.get(aws_url)
aws_bulk_json = aws_bulk_json_request.json()

with open(aws_raw_location, 'w') as aws_raw_file:
    json.dump(aws_bulk_json, aws_raw_file)

# Extracting the relevant prices from the raw AWS file, and putting them in a consistent, usable format. In this case, we'll only be using the general purpose and micro size instances.

# In[8]:

# Getting the instance products
with open(aws_raw_location, 'r') as aws_raw_file:
    aws_raw_json = json.load(aws_raw_file)

aws_instances_list = []
for product in aws_raw_json['products']:
    productFamily = aws_raw_json['products'][product]['productFamily']
    # Check the product is a compute instance
    if productFamily == 'Compute Instance':
        # Check the instance is appropriate
        product_attributes = aws_raw_json['products'][product]['attributes']
        instanceFamily = product_attributes['instanceFamily']
        is_current_gen = product_attributes['currentGeneration'] == 'Yes'
        is_linux = product_attributes['operatingSystem'] == 'Linux'
        no_preInstalledSw = product_attributes['preInstalledSw'] == 'NA'
        is_shared_instance = product_attributes['tenancy'] == 'Shared'
        if instanceFamily in aws_acceptable_instance_families and is_current_gen \
                and is_linux and no_preInstalledSw and is_shared_instance:
            # Append if appropriate
            aws_instances_list.append(product)

# In[9]:

aws_prices_list = []
for instance_key in aws_instances_list:
    attributes = aws_raw_json['products'][instance_key]['attributes']

    # Get vCPU and RAM
    vCPU = float(attributes['vcpu'].replace(',', ''))
    RAM = float(attributes['memory'].split(' ')[0].replace(',', ''))

    # Break the storage spec into an array
    storage_strings = attributes['storage'].split(' ')

    # Find where the numbers end (e.g. "200 x 1"), and where the description of the storage type (e.g. "SSD") starts.
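    # For example (storage strings illustrative, not taken from the live price list):
    #   "2 x 80 SSD"  -> numbers end at "80", the storage type is SSD
    #   "1 x 160"     -> numbers end at "160", no type is given (treated as SSD below)
    #   "EBS only"    -> no numbers, so no local storage is included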
    final_num_index = None
    for word in storage_strings[::-1]:
        try:
            float(word.replace(',', ''))
            final_num_index = storage_strings.index(word)
            break
        except ValueError:
            # Not a number; keep scanning backwards
            continue

    # If there are no numbers in the storage spec, there is no storage included
    if final_num_index is None:
        total_ssd = 0.0
        total_hdd = 0.0
    # Else...
    else:
        # Perform the maths to figure out how many GB of storage is included,
        # e.g. "2 x 80" is evaluated as 2 * 80
        storage_calcs = storage_strings[0:final_num_index + 1]
        storage_volume = eval(' '.join(['*' if x == 'x' else x.replace(',', '') for x in storage_calcs]))

        # Discern the type of storage
        if 'HDD' in storage_strings:
            total_ssd = 0.0
            total_hdd = float(storage_volume)
        elif 'SSD' in storage_strings:
            total_ssd = float(storage_volume)
            total_hdd = 0.0
        else:
            # If the type isn't labelled, assume SSD
            total_ssd = float(storage_volume)
            total_hdd = 0.0

    # Get the price in USD
    terms = aws_raw_json['terms']['OnDemand'][instance_key]
    usd_price = None
    for specific_term in terms:
        for dimension_key in terms[specific_term]['priceDimensions']:
            dimension = terms[specific_term]['priceDimensions'][dimension_key]
            if dimension['unit'] != 'Hrs':
                raise ValueError("This price isn't in hours")
            usd_price = float(dimension['pricePerUnit']['USD'])

    if not usd_price:
        continue

    # Convert to NZD
    nzd_price = usd_price * usd_to_nzd_exchange_rate

    # Append to the list of prices
    aws_prices_list.append({
        'Name': attributes['instanceType'],
        'vCPU': vCPU,
        'RAM, GB': RAM,
        'Price per hour, NZD (ex GST)': nzd_price,
        'Price per hour, USD (ex GST)': usd_price,
        'SSD storage, GB': total_ssd,
        'HDD storage, GB': total_hdd,
    })

# Convert to CSV
aws_dataframe = pd.DataFrame(aws_prices_list)
aws_dataframe.to_csv(aws_data_location)

print('Downloaded AWS prices, with {} items.'.format(aws_dataframe.shape[0]))

# ### Google Cloud prices
#
# Scraping Google Cloud's documentation for the prices of instance sizes.

# In[11]:

# Variables
google_prices_url = 'https://cloud-dot-devsite-v2-prod.appspot.com/compute/all-pricing_f139b6960aafba0f57015e855bf5748ad892aa7e94f9dee00434116c29ca229a.frame'
google_instances_url = 'https://cloud-dot-devsite-v2-prod.appspot.com/compute/all-pricing_70247bb78d85862a2b290545ac82cd3c0f4e0e7aa5ea1092e8dcba180b24ab80.frame'
google_price_type = 'syd-hourly'
google_acceptable_instance_families = [
    'standard_machine_types'
]
google_data_location = 'dataset/google_price_data.csv'
google_gst_exclusive = True

# In[12]:

google_price_page_html = requests.get(google_prices_url).text
google_price_page = BeautifulSoup(google_price_page_html, 'html.parser')

google_instance_page_html = requests.get(google_instances_url).text
google_instance_page = BeautifulSoup(google_instance_page_html, 'html.parser')

# Scraping the custom compute sizes, for easier pricing later.

# In[13]:

# Extract the USD price per vCPU and per GB of RAM.
# The price lives in the cell's region-specific attribute (here 'syd-hourly').
google_custom_compute_price_table = google_price_page.find('table')
google_custom_compute_rows = google_custom_compute_price_table.find_all('tr')[1:]

google_per_vcpu_usd = float(google_custom_compute_rows[0].find_all('td')[1][google_price_type].split()[0].strip('$'))
google_per_ram_usd = float(google_custom_compute_rows[1].find_all('td')[1][google_price_type].split()[0].strip('$'))

# In[14]:

def most_freq_num(text):
    # Pull every number out of the text and return the one that appears most often
    number_list = re.findall(r'\d*\.?\d+', text)
    most_frequent_num = max(set(number_list), key=number_list.count)
    return float(most_frequent_num)
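# A quick illustration of the helper above: in markup like '<td cpu="4">4 vCPUs</td>' the value appears both as an attribute and as text, so taking the most frequent number in the raw cell recovers it. (The markup here is made up for illustration, not taken from the actual pricing page.)

# In[ ]:

print(most_freq_num('<td cpu="4">4 vCPUs</td>'))  # 4.0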
# Scraping the predefined instance size price table. In this case, we'll only be scraping the standard machine types.

# In[15]:

google_prices_list = []

for instance_type in google_acceptable_instance_families:
    # With a single acceptable family, this simply takes the first price table on the page
    google_price_table = google_instance_page.find('table')
    google_rows = google_price_table.find_all('tr')[1:-1]

    for row in google_rows:
        cells = row.find_all('td')

        # Extract the name, number of vCPUs and GB of RAM
        try:
            name = cells[0].get_text().strip()

            # Ignore rows with 'lake' in the name (to remove Skylake instances)
            if 'lake' in name:
                continue

            cpu_val = most_freq_num(str(cells[1]))
            ram_val = most_freq_num(str(cells[2]))
        except (IndexError, ValueError):
            # Not a regular data row, so skip it
            continue

        # Calculate the USD and NZD prices from the per-vCPU and per-GB rates
        usd_price = (google_per_ram_usd * ram_val) + (google_per_vcpu_usd * cpu_val)
        nzd_price = usd_price * usd_to_nzd_exchange_rate

        google_prices_list.append({
            'Name': name,
            'vCPU': cpu_val,
            'RAM, GB': ram_val,
            'Price per hour, NZD (ex GST)': nzd_price,
            'Price per hour, USD (ex GST)': usd_price,
            'SSD storage, GB': 0.0,
            'HDD storage, GB': 0.0,
        })

google_dataframe = pd.DataFrame(google_prices_list)
google_dataframe.to_csv(google_data_location)

print('Downloaded Google prices, with {} items.'.format(google_dataframe.shape[0]))

# ### Azure cloud prices
#
# Scraping the Azure on demand price pages. In this case, we'll only be scraping those listed as general purpose.

# In[17]:

# Variables
azure_url = 'https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/'
azure_data_location = 'dataset/azure_price_data.csv'
azure_acceptable_categories = [
    'general-purpose-filter'
]
azure_acceptable_regions = [
    'australia-central',
    'australia-central-2',
    'australia-east',
    'australia-southeast'
]
azure_ssd_temp_disk_vms = [
    r'A\d.v2',       # Regex for the Av2-series
    r'D\d{1,2}.v2',  # Regex for the Dv2-series
    r'D\d{1,2}.v3'   # Regex for the Dv3-series
]
azure_gst_exclusive = True
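# A quick, optional check of the temp-disk regexes above (the instance names here are illustrative). Note that the `.` in each pattern matches the space in names like 'D2 v3', which is why the patterns work against the display names scraped from the table.

# In[ ]:

print(bool(re.match(r'D\d{1,2}.v3', 'D2 v3')))                    # True  -> temporary disk treated as SSD
print(any(re.match(p, 'B2ms') for p in azure_ssd_temp_disk_vms))  # False -> falls through to HDD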
# In[18]:

azure_price_page_html = requests.get(azure_url).text
azure_price_page = BeautifulSoup(azure_price_page_html, 'html.parser')

# In[19]:

# Find all appropriate price tables
appropriate_price_tables = []
for category in azure_acceptable_categories:
    category_section = azure_price_page.find_all(attrs={'data-filter': category})[0]
    appropriate_price_tables += category_section.find_all('table')

# Extract data from the tables
azure_instance_list = []
for price_table in appropriate_price_tables:
    table_rows = price_table.tbody.find_all('tr')

    # Work through each row
    for row in table_rows:
        cells = row.find_all('td')

        # Find a price by checking whether each of the acceptable regions has one
        usd_price = None
        price_json = json.loads(cells[5].span['data-amount'])['regional']
        for region in azure_acceptable_regions:
            if region in price_json:
                usd_price = price_json[region]
                break

        # If none of the regions have it, skip this row
        if usd_price is None:
            continue

        # Get the NZD price
        nzd_price = usd_price * usd_to_nzd_exchange_rate

        # Get the name
        name = cells[1].string

        # Get the vCPU value
        cpu_val = float(cells[2].string)

        # Get the RAM value
        ram_string = cells[3].string
        ram_string = ram_string.strip(' GiB')
        ram_val = float(ram_string)

        # Get the storage value
        storage_string = cells[4].string
        storage_string = storage_string.strip(' GiB')
        storage_string = storage_string.replace(',', '')
        storage_val = float(storage_string)

        # Get the storage type: default to HDD, unless the name matches one of the
        # series known to use SSD temporary disks
        storage_type = 'HDD'
        for regex_string in azure_ssd_temp_disk_vms:
            if re.match(regex_string, name):
                storage_type = 'SSD'
                break

        if storage_type == 'HDD':
            azure_instance_list.append({
                'Name': name,
                'vCPU': cpu_val,
                'RAM, GB': ram_val,
                'Price per hour, NZD (ex GST)': nzd_price,
                'Price per hour, USD (ex GST)': usd_price,
                'SSD storage, GB': 0.0,
                'HDD storage, GB': storage_val
            })
        elif storage_type == 'SSD':
            azure_instance_list.append({
                'Name': name,
                'vCPU': cpu_val,
                'RAM, GB': ram_val,
                'Price per hour, NZD (ex GST)': nzd_price,
                'Price per hour, USD (ex GST)': usd_price,
                'SSD storage, GB': storage_val,
                'HDD storage, GB': 0.0
            })
        else:
            raise ValueError('Unknown storage type.')

azure_dataframe = pd.DataFrame(azure_instance_list)
azure_dataframe.to_csv(azure_data_location)

print('Downloaded Azure prices, with {} items.'.format(azure_dataframe.shape[0]))

# ### Ingesting the datasets
#
# Importing the datasets from their saved locations.

# In[21]:

catalyst_dataset = pd.read_csv(catalyst_data_location, index_col=0)
google_dataset = pd.read_csv(google_data_location, index_col=0)
aws_dataset = pd.read_csv(aws_data_location, index_col=0)
azure_dataset = pd.read_csv(azure_data_location, index_col=0)

# Previewing the datasets.

# In[22]:

catalyst_dataset.head(6)

# Now we'll filter out excessively large instance sizes: namely, those with either more than 360 GB of RAM or more than 64 vCPUs.
#
# This prevents a few very large instances from dominating the graphs and leaving the smaller instances compared on a scale too small to distinguish visually.
#
# Remember, we're only dealing with general purpose compute here, and for that application, small instances are often an important tool. Consequently, it is worthwhile to be specific.

# In[23]:

def filter_dataset(dataset):
    without_high_ram = dataset[(dataset['RAM, GB'] <= 360) & (dataset['vCPU'] <= 64)]
    return without_high_ram

# In[24]:

catalyst_dataset = filter_dataset(catalyst_dataset)
google_dataset = filter_dataset(google_dataset)
aws_dataset = filter_dataset(aws_dataset)
azure_dataset = filter_dataset(azure_dataset)

# Now we'll split the data into NumPy arrays of input features (x) and labels (y).

# In[25]:

def split_dataset(dataset):
    x = dataset[["vCPU", "RAM, GB", "HDD storage, GB", "SSD storage, GB"]].values
    y = dataset["Price per hour, NZD (ex GST)"].values
    return (x, y)

# In[26]:

catalyst_x, catalyst_y = split_dataset(catalyst_dataset)
google_x, google_y = split_dataset(google_dataset)
aws_x, aws_y = split_dataset(aws_dataset)
azure_x, azure_y = split_dataset(azure_dataset)

# ## The math
# To analyse this dataset, we'll be using [multiple linear regression](https://en.wikipedia.org/wiki/Linear_regression) to predict the prices of compute instance flavours if they were offered by cloud providers that do not typically offer that sized flavour.
#
# The multiple linear regression models will draw a hyperplane across our data space (in this case, 5 dimensional space) that comes as close as possible to intersecting every data point in our dataset. You can see an example of this (in 2 dimensional space) by Khan Academy below.

# In[27]:

from IPython.core.display import display, HTML
display(HTML(''))

# By then evaluating the hyperplane at a given combination of vCPUs, RAM, HDD and SSD, we can find the predicted price. This gives us a way to estimate the price of a flavour if it were offered by a given cloud provider, even if they do not actually offer it.
#
# I've used linear regression as the predictive algorithm because I'm assuming that cloud providers scale their pricing in a linear pattern.
#
# First we'll initialise the regression models, and train them on the cloud providers' prices.

# In[28]:

# Initialise regressors
catalyst_linear = LinearRegression()
google_linear = LinearRegression()
aws_linear = LinearRegression()
azure_linear = LinearRegression()

# Train regressors
catalyst_linear.fit(catalyst_x, catalyst_y)
google_linear.fit(google_x, google_y)
aws_linear.fit(aws_x, aws_y)
azure_linear.fit(azure_x, azure_y)
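# A quick, optional look at what each fitted model has learned. With input features (vCPU, RAM, HDD, SSD), the prediction takes the form price ≈ intercept + c1·vCPU + c2·RAM + c3·HDD + c4·SSD, so the coefficients can be read as each provider's implied hourly rate per vCPU and per GB. This cell is just a sanity check and isn't used in the analysis below.

# In[ ]:

for provider, model in [('Catalyst', catalyst_linear), ('Google', google_linear),
                        ('AWS', aws_linear), ('Azure', azure_linear)]:
    print(provider, model.intercept_, model.coef_)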
# Now we have the models predict the other providers' instance prices. By having every provider's model predict the prices of every other provider's instance sizes, we can see whether the pattern holds across the different models.

# In[29]:

# Predict Catalyst X
google_cata_price = google_linear.predict(catalyst_x)
aws_cata_price = aws_linear.predict(catalyst_x)
azure_cata_price = azure_linear.predict(catalyst_x)

# Predict Google X
aws_goog_price = aws_linear.predict(google_x)
azure_goog_price = azure_linear.predict(google_x)
catalyst_goog_price = catalyst_linear.predict(google_x)

# Predict AWS X
google_aws_price = google_linear.predict(aws_x)
azure_aws_price = azure_linear.predict(aws_x)
catalyst_aws_price = catalyst_linear.predict(aws_x)

# Predict Azure X
google_azure_price = google_linear.predict(azure_x)
aws_azure_price = aws_linear.predict(azure_x)
catalyst_azure_price = catalyst_linear.predict(azure_x)

# ## The results
#
# Now we can bring the results together and compare the prices against each other on an even scale.
#
# A good scientist would, at this point, verify these results by comparing the predicted prices against actual prices for an instance size that two providers both offer. I would love to do this; however, I could find no such overlap.
#
# Please note that the X axis is a range from zero to the number of flavours offered by the provider being predicted. Each number on the X axis represents a single flavour from that provider. I've done it this way because the plotting method does not support non-numerical axis ticks.

# In[30]:

def graph_pred(names, predictions):
    flavors_num = predictions[0].shape[0]
    for index, name in enumerate(names):
        plt.plot(range(flavors_num), predictions[index], label=name)
    plt.legend(loc=2)

# ### Predicting Catalyst Cloud

# In[31]:

graph_pred([
    "Catalyst",
    "Google",
    "AWS",
    "Azure"
], [
    catalyst_y,
    google_cata_price,
    aws_cata_price,
    azure_cata_price
])

# ### Predicting Google Cloud

# In[32]:

graph_pred([
    "Catalyst",
    "Google",
    "AWS",
    "Azure"
], [
    catalyst_goog_price,
    google_y,
    aws_goog_price,
    azure_goog_price,
])

# ### Predicting AWS

# In[33]:

graph_pred([
    "Catalyst",
    "Google",
    "AWS",
    "Azure"
], [
    catalyst_aws_price,
    google_aws_price,
    aws_y,
    azure_aws_price,
])

# ### Predicting Azure

# In[34]:

graph_pred([
    "Catalyst",
    "Google",
    "AWS",
    "Azure"
], [
    catalyst_azure_price,
    google_azure_price,
    aws_azure_price,
    azure_y
])

# ### Interpreting the results
#
# As you can see, the multiple linear regression models consistently predict that Catalyst Cloud would offer either the cheapest or competitively priced compute instances.
#
# While this is by no means a perfect indicator of who is the cheapest, and should not be taken as such, it does serve to dispel the idea that Catalyst Cloud is overpriced, or cannot compete with international companies on price.
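# One rough way to put a number on the charts above (an aggregate only, not a rigorous comparison): the mean hourly price across Catalyst's flavour sizes, as offered by Catalyst and as predicted for the other providers.

# In[ ]:

cata_flavour_summary = pd.DataFrame({
    'Catalyst': catalyst_y,
    'Google': google_cata_price,
    'AWS': aws_cata_price,
    'Azure': azure_cata_price,
})
print(cata_flavour_summary.mean())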
# ### Saving the results
# For further analysis later, we'll save the predicted prices as CSVs.

# In[35]:

def pred_save(origin_flavors, provider_names, predictions):
    # Copy so we don't mutate the original dataset when adding the price columns
    flavor_data = origin_flavors[["Name", "vCPU", "RAM, GB", "HDD storage, GB", "SSD storage, GB"]].copy()
    for index, provider_prediction in enumerate(predictions):
        unit_string = ' price per hour, NZD (ex GST)'
        company_name = provider_names[index]
        flavor_data[company_name + unit_string] = provider_prediction
    return flavor_data

# In[36]:

final_cat_data = pred_save(
    catalyst_dataset,
    [
        "Catalyst",
        "Google",
        "AWS",
        "Azure"
    ], [
        catalyst_y,
        google_cata_price,
        aws_cata_price,
        azure_cata_price
    ])

# In[37]:

final_google_data = pred_save(
    google_dataset,
    [
        "Catalyst",
        "Google",
        "AWS",
        "Azure"
    ], [
        catalyst_goog_price,
        google_y,
        aws_goog_price,
        azure_goog_price,
    ])

# In[38]:

final_aws_data = pred_save(
    aws_dataset,
    [
        "Catalyst",
        "Google",
        "AWS",
        "Azure"
    ], [
        catalyst_aws_price,
        google_aws_price,
        aws_y,
        azure_aws_price,
    ])

# In[39]:

final_azure_data = pred_save(
    azure_dataset,
    [
        "Catalyst",
        "Google",
        "AWS",
        "Azure"
    ], [
        catalyst_azure_price,
        google_azure_price,
        aws_azure_price,
        azure_y
    ])

# In[40]:

final_cat_data.to_csv('predicted-dataset/predicted_catalyst_prices.csv')
final_google_data.to_csv('predicted-dataset/predicted_google_prices.csv')
final_aws_data.to_csv('predicted-dataset/predicted_aws_prices.csv')
final_azure_data.to_csv('predicted-dataset/predicted_azure_prices.csv')

# In[41]:

print('Saved the resulting datasets.')

# In[ ]: