In [1]:
import numpy
import pandas
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression
In [2]:
# Load the tab-separated series file and prepare it in one pipeline:
#   1. drop rows whose period code is "M13" (presumably an annual-average
#      pseudo-period rather than a real month -- TODO confirm against the
#      data dictionary),
#   2. derive a two-character month code by stripping the leading letter
#      from the period code (e.g. "M02" -> "02"),
#   3. build a timestamp from "<year>-<month>",
#   4. keep only rows dated strictly after 2007-01-01.
# NOTE(review): the comparison is strict, so January 2007 itself is excluded
# and the retained data starts at 2007-02-01 -- confirm this is intended.
jobs_data = (
    pandas.read_csv("ce_unadjusted.txt", sep="\t")
    .loc[lambda frame: frame["period"] != "M13"]
    .assign(month=lambda frame: frame["period"].map(lambda code: code[1:]))
    .assign(
        date=lambda frame: pandas.to_datetime(
            frame["year"].astype(str) + "-" + frame["month"]
        )
    )
    .loc[lambda frame: frame["date"] > "2007-01-01"]
)
In [4]:
# Preview the first 30 rows of the filtered frame.
jobs_data.iloc[:30]
Out[4]:
series_id year period value footnote_codes month date
885 CEU0000000001 2007 M02 136026 02 2007-02-01
886 CEU0000000001 2007 M03 136923 03 2007-03-01
887 CEU0000000001 2007 M04 137730 04 2007-04-01
888 CEU0000000001 2007 M05 138673 05 2007-05-01
889 CEU0000000001 2007 M06 139175 06 2007-06-01
890 CEU0000000001 2007 M07 137778 07 2007-07-01
891 CEU0000000001 2007 M08 137890 08 2007-08-01
892 CEU0000000001 2007 M09 138455 09 2007-09-01
893 CEU0000000001 2007 M10 139198 10 2007-10-01
894 CEU0000000001 2007 M11 139510 11 2007-11-01
895 CEU0000000001 2007 M12 139297 12 2007-12-01
897 CEU0000000001 2008 M01 136268 01 2008-01-01
898 CEU0000000001 2008 M02 136787 02 2008-02-01
899 CEU0000000001 2008 M03 137378 03 2008-03-01
900 CEU0000000001 2008 M04 137914 04 2008-04-01
901 CEU0000000001 2008 M05 138488 05 2008-05-01
902 CEU0000000001 2008 M06 138682 06 2008-06-01
903 CEU0000000001 2008 M07 137203 07 2008-07-01
904 CEU0000000001 2008 M08 137094 08 2008-08-01
905 CEU0000000001 2008 M09 137148 09 2008-09-01
906 CEU0000000001 2008 M10 137442 10 2008-10-01
907 CEU0000000001 2008 M11 136761 11 2008-11-01
908 CEU0000000001 2008 M12 135732 12 2008-12-01
910 CEU0000000001 2009 M01 132042 01 2009-01-01
911 CEU0000000001 2009 M02 131808 02 2009-02-01
912 CEU0000000001 2009 M03 131675 03 2009-03-01
913 CEU0000000001 2009 M04 131859 04 2009-04-01
914 CEU0000000001 2009 M05 132132 05 2009-05-01
915 CEU0000000001 2009 M06 131950 06 2009-06-01
916 CEU0000000001 2009 M07 130353 07 2009-07-01
In [5]:
# Number of rows that remain after filtering.
jobs_data.shape[0]
Out[5]:
145
In [20]:
# Line plot of the raw "value" column against the derived "date" column,
# drawn on the current axes.
pyplot.gca().plot(jobs_data["date"], jobs_data["value"])
pyplot.show()
In [18]:
# Fit a straight-line trend of "value" on the calendar year alone, then show
# the fitted intercept and the single slope coefficient.
model = LinearRegression()
model.fit(X=jobs_data[["year"]], y=jobs_data["value"])
print(model.intercept_, model.coef_)
-2609662.5205453155 [1365.45122888]
In [7]:
# Row-over-row difference of "value" (each row minus the previous row).
jobs_data["change"] = jobs_data["value"].diff()
# diff() leaves NaN in the first row because it has no predecessor. Drop only
# rows whose "change" is missing: a bare dropna() would also discard any row
# that happens to have a missing value in an unrelated column (for example
# "footnote_codes"), silently punching holes in the series.
# NOTE: this cell overwrites jobs_data, so re-running it trims one more
# leading row each time; run it once per fresh load.
jobs_data = jobs_data.dropna(subset=["change"])
In [8]:
# Line plot of the row-over-row "change" column against "date", drawn on the
# current axes.
pyplot.gca().plot(jobs_data["date"], jobs_data["change"])
pyplot.show()
In [26]:
# Average "change" for each month code ("01".."12") across all years; the
# month codes become the x-axis and the means the y-axis.
mean_change_by_month = jobs_data.groupby(["month"])["change"].mean()
pyplot.plot(mean_change_by_month)
pyplot.show()
In [9]:
# One-hot encode the month code: one indicator column per distinct month
# value, named after the code itself ("01", "02", ...).
month_dummies = pandas.get_dummies(jobs_data["month"])
# Remove indicator columns left over from an earlier run of this cell before
# attaching the fresh ones. Without this, re-running the cell appends a second
# set of identically named columns, and a later selection such as
# jobs_data[["01", ...]] would return every duplicate. On a first run nothing
# matches, so nothing is dropped.
jobs_data = pandas.concat(
    [jobs_data.drop(columns=month_dummies.columns, errors="ignore"), month_dummies],
    axis=1,
)
jobs_data.head()
Out[9]:
series_id year period value footnote_codes month date change 01 02 03 04 05 06 07 08 09 10 11 12
886 CEU0000000001 2007 M03 136923 03 2007-03-01 897.0 0 0 1 0 0 0 0 0 0 0 0 0
887 CEU0000000001 2007 M04 137730 04 2007-04-01 807.0 0 0 0 1 0 0 0 0 0 0 0 0
888 CEU0000000001 2007 M05 138673 05 2007-05-01 943.0 0 0 0 0 1 0 0 0 0 0 0 0
889 CEU0000000001 2007 M06 139175 06 2007-06-01 502.0 0 0 0 0 0 1 0 0 0 0 0 0
890 CEU0000000001 2007 M07 137778 07 2007-07-01 -1397.0 0 0 0 0 0 0 1 0 0 0 0 0
In [27]:
# Regress the row-over-row change on the calendar year plus the twelve month
# indicator columns.
# NOTE(review): exactly one month indicator is set per row, so the twelve
# indicators together duplicate the intercept that LinearRegression fits by
# default. Predictions are unaffected, but the individual coefficients are
# presumably determined by the solver's choice among equivalent solutions --
# confirm before interpreting them.
seasonal_features = [
    "year",
    "01", "02", "03", "04", "05", "06",
    "07", "08", "09", "10", "11", "12",
]
model = LinearRegression()
model.fit(jobs_data[seasonal_features], jobs_data["change"])
In [28]:
# Show the fitted intercept followed by the coefficient array; coefficients
# come back in the same order as the feature columns passed to fit().
fitted_intercept, fitted_coefficients = model.intercept_, model.coef_
print(fitted_intercept, fitted_coefficients)
-78687.40321484076 [   39.14131702 -3074.13859751   628.44473582   645.91938617
   849.7527195    716.66938617   327.5027195  -1308.83061383
   139.2527195    423.08605284   786.83605284   219.58605284
  -354.08061383]
In [29]:
# Predict the change for every row with the fitted model, then plot the
# residuals (observed change minus predicted change) against the date.
prediction_inputs = jobs_data[
    ["year", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
]
predictions = model.predict(prediction_inputs)
residuals = jobs_data["change"] - predictions
pyplot.plot(jobs_data["date"], residuals)
# To inspect the fitted values themselves instead of the residuals, plot
# `predictions` against jobs_data["date"].
pyplot.show()