Ingests the raw data from the Bureau of Labor Statistics' Current Employement Statistics Program and combines it into a simple consolidated file.
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
values_df = pd.read_csv(
"bls/input/ce.data.0.ALL_CES_Series.tsv",
delimiter="\t"
)
datatype_df = pd.read_csv(
"bls/input/ce.datatype.tsv",
delimiter="\t",
skiprows=1,
dtype={0: str},
names=["data_type_code", "data_type_text", "blank"]
)
footnote_df = pd.read_csv(
"bls/input/ce.footnote.tsv",
delimiter="\t",
skiprows=1,
names=["footnote_code", "footnote_text", "blank"]
)
industry_df = pd.read_csv(
"bls/input/ce.industry.tsv",
delimiter="\t",
skiprows=1,
dtype={0: str},
names=["industry_code", "naics_code", "publishing_status", "industry_name", "display_level", "selectable", "sort_sequence", "blank"]
)
period_df = pd.read_csv(
"bls/input/ce.period.tsv",
delimiter="\t",
names=["period_code", "period_abbreviation", "period_name"]
)
seasonal_df = pd.read_csv(
"bls/input/ce.seasonal.tsv",
delimiter="\t"
)
series_df = pd.read_csv(
"bls/input/ce.series.tsv",
delimiter="\t",
dtype={
"supersector_code": str,
"industry_code": str,
"data_type_code": str,
}
)
supersector_df = pd.read_csv(
"bls/input/ce.supersector.tsv",
delimiter="\t",
skiprows=1,
dtype={0: str},
names=["supersector_code", "supersector_name", "blank"]
)
"{:,d}".format(len(values_df))
'7,088,149'
merged_df = pd.merge(
series_df,
values_df,
on="series_id"
)
len(merged_df)
7088149
merged_df = pd.merge(
merged_df,
supersector_df,
on="supersector_code"
)
"{:,d}".format(len(merged_df))
'7,088,149'
merged_df = pd.merge(
merged_df,
industry_df,
on="industry_code"
)
"{:,d}".format(len(merged_df))
'7,088,149'
merged_df = pd.merge(
merged_df,
datatype_df,
on="data_type_code"
)
"{:,d}".format(len(merged_df))
'7,088,149'
merged_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7088149 entries, 0 to 7088148 Data columns (total 26 columns): series_id object supersector_code object industry_code object data_type_code object seasonal object series_title object footnote_codes_x object begin_year int64 begin_period object end_year int64 end_period object year int64 period object value float64 footnote_codes_y object supersector_name object blank_x float64 naics_code object publishing_status object industry_name object display_level int64 selectable object sort_sequence int64 blank_y float64 data_type_text object blank float64 dtypes: float64(4), int64(5), object(17) memory usage: 1.4+ GB
merged_df.drop([
"blank_y",
"blank_x",
"footnote_codes_y",
"blank",
"supersector_code",
"industry_code",
"data_type_code",
"selectable",
"sort_sequence"
], axis=1, inplace=True)
merged_df.rename(
columns={
"footnote_codes_x": "footenote_code",
},
inplace=True
)
merged_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7088149 entries, 0 to 7088148 Data columns (total 17 columns): series_id object seasonal object series_title object footenote_code object begin_year int64 begin_period object end_year int64 end_period object year int64 period object value float64 supersector_name object naics_code object publishing_status object industry_name object display_level int64 data_type_text object dtypes: float64(1), int64(4), object(12) memory usage: 973.4+ MB
merged_df.to_csv("bls/output/bls_ce_transformed.csv", index=False)