import pandas as pd
import numpy as np
import re
import json
import matplotlib.pyplot as plt
%matplotlib inline
db = json.load(open("dataset/foods.json.txt"))
len(db)
6636
db[0].keys()
dict_keys(['description', 'manufacturer', 'portions', 'id', 'tags', 'group', 'nutrients'])
db[0]['nutrients'][0]
{'description': 'Protein', 'group': 'Composition', 'units': 'g', 'value': 25.18}
nutrients = pd.DataFrame(db[0]['nutrients'])
nutrients.head()
description | group | units | value | |
---|---|---|---|---|
0 | Protein | Composition | g | 25.18 |
1 | Total lipid (fat) | Composition | g | 29.20 |
2 | Carbohydrate, by difference | Composition | g | 3.06 |
3 | Ash | Other | g | 3.28 |
4 | Energy | Energy | kcal | 376.00 |
info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(db, columns=info_keys)
info.head()
description | group | id | manufacturer | |
---|---|---|---|---|
0 | Cheese, caraway | Dairy and Egg Products | 1008 | |
1 | Cheese, cheddar | Dairy and Egg Products | 1009 | |
2 | Cheese, edam | Dairy and Egg Products | 1018 | |
3 | Cheese, feta | Dairy and Egg Products | 1019 | |
4 | Cheese, mozzarella, part skim milk | Dairy and Egg Products | 1028 |
info.describe()
id | |
---|---|
count | 6636.000000 |
mean | 15733.055304 |
std | 8715.095274 |
min | 1008.000000 |
25% | 10170.750000 |
50% | 14445.000000 |
75% | 19188.250000 |
max | 93600.000000 |
info.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6636 entries, 0 to 6635 Data columns (total 4 columns): description 6636 non-null object group 6636 non-null object id 6636 non-null int64 manufacturer 5195 non-null object dtypes: int64(1), object(3) memory usage: 207.5+ KB
pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products 812 Beef Products 618 Baked Products 496 Breakfast Cereals 403 Fast Foods 365 Legumes and Legume Products 365 Lamb, Veal, and Game Products 345 Sweets 341 Fruits and Fruit Juices 328 Pork Products 328 Name: group, dtype: int64
# Now, to do some analysis on all of the nutrient data, it’s easiest to assemble the nutrients
# for each food into a single large table. To do so, we need to take several steps. First, I’ll
# convert each list of food nutrients to a DataFrame, add a column for the food id, and
# append the DataFrame to a list. Then, these can be concatenated together with concat:
nutrients = []
for rec in db:
fnuts = pd.DataFrame(rec['nutrients'])
fnuts['id'] = rec['id']
nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
nutrients.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 389355 entries, 0 to 389354 Data columns (total 5 columns): description 389355 non-null object group 389355 non-null object units 389355 non-null object value 389355 non-null float64 id 389355 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 14.9+ MB
# check for duplicates,
nutrients.duplicated().sum()
14179
# dropping the dups
nutrients = nutrients.drop_duplicates()
# Since 'group' and 'description' is in both DataFrame objects, we can rename them to
# make it clear what is what:
col_mapping = {'description' : 'food',
'group' : 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6636 entries, 0 to 6635 Data columns (total 4 columns): food 6636 non-null object fgroup 6636 non-null object id 6636 non-null int64 manufacturer 5195 non-null object dtypes: int64(1), object(3) memory usage: 207.5+ KB
col_mapping = {'description' : 'nutrient',
'group' : 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 375176 entries, 0 to 389354 Data columns (total 5 columns): nutrient 375176 non-null object nutgroup 375176 non-null object units 375176 non-null object value 375176 non-null float64 id 375176 non-null int64 dtypes: float64(1), int64(1), object(3) memory usage: 17.2+ MB
# joining the data on column, using merge method
ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 375176 entries, 0 to 375175 Data columns (total 8 columns): nutrient 375176 non-null object nutgroup 375176 non-null object units 375176 non-null object value 375176 non-null float64 id 375176 non-null int64 food 375176 non-null object fgroup 375176 non-null object manufacturer 293054 non-null object dtypes: float64(1), int64(1), object(6) memory usage: 25.8+ MB
# checking 30000th row
ndata.ix[30000]
nutrient Glycine nutgroup Amino Acids units g value 0.04 id 6158 food Soup, tomato bisque, canned, condensed fgroup Soups, Sauces, and Gravies manufacturer Name: 30000, dtype: object
ndata.loc[50000]
nutrient Vitamin B-12 nutgroup Vitamins units mcg value 0.04 id 8185 food Cereals, RALSTON, cooked with water, with salt fgroup Breakfast Cereals manufacturer Name: 50000, dtype: object
# aggregate the data on nutrient, fgroup columns
# picking value column
#
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].order().plot(kind='barh')
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(...) if __name__ == '__main__':
<matplotlib.axes._subplots.AxesSubplot at 0x17155a3bf60>
result['Zinc, Zn'].order().plot(kind='box')
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(...) if __name__ == '__main__':
<matplotlib.axes._subplots.AxesSubplot at 0x1714fa72860>
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())
max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
# make the food a little smaller
max_foods.food = max_foods.food.str[:50]
max_foods.ix['Amino Acids']['food']
nutrient Alanine Gelatins, dry powder, unsweetened Arginine Seeds, sesame flour, low-fat Aspartic acid Soy protein isolate Cystine Seeds, cottonseed flour, low fat (glandless) Glutamic acid Soy protein isolate Glycine Gelatins, dry powder, unsweetened Histidine Whale, beluga, meat, dried (Alaska Native) Hydroxyproline KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA... Isoleucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Leucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Lysine Seal, bearded (Oogruk), meat, dried (Alaska Na... Methionine Fish, cod, Atlantic, dried and salted Phenylalanine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Proline Gelatins, dry powder, unsweetened Serine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Threonine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Tryptophan Sea lion, Steller, meat with fat (Alaska Native) Tyrosine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Valine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Name: food, dtype: object