%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# Import the Pandas library
import pandas as pd
# Load the train and test datasets to create two DataFrames
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
#Print the `head` of the train and test dataframes
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
test.head()
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
train.to_csv('../data/tatanic_train.csv')
test.to_csv('../data/tatanic_test.csv')
You can easily explore a DataFrame
import pandas as pd
train = pd.read_csv('../data/tatanic_train.csv',\
sep = ",", header=0)
test = pd.read_csv('../data/tatanic_test.csv',\
sep = ",", header=0)
train.head()
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
train.describe()
/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile RuntimeWarning)
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | NaN | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | NaN | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | NaN | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
train.shape#, len(train)
#train.columns
(891, 13)
# Passengers that survived vs passengers that passed away
train["Survived"]
0 0 1 1 2 1 3 1 4 0 5 0 6 0 7 0 8 1 9 1 10 1 11 1 12 0 13 0 14 0 15 1 16 0 17 1 18 0 19 1 20 0 21 1 22 1 23 1 24 0 25 1 26 0 27 0 28 1 29 0 .. 861 0 862 1 863 0 864 0 865 1 866 1 867 0 868 0 869 1 870 0 871 1 872 0 873 0 874 1 875 1 876 0 877 0 878 0 879 1 880 1 881 0 882 0 883 0 884 0 885 0 886 0 887 1 888 0 889 1 890 0 Name: Survived, dtype: int64
# Passengers that survived vs passengers that passed away
train["Survived"].value_counts()
0 549 1 342 Name: Survived, dtype: int64
# As proportions
train["Survived"].value_counts(normalize = True)
0 0.616162 1 0.383838 Name: Survived, dtype: float64
train['Sex'].value_counts()
male 577 female 314 Name: Sex, dtype: int64
train[train['Sex']=='female']#[train['Pclass'] == 3]
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
8 | 8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
10 | 10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
11 | 11 | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 113783 | 26.5500 | C103 | S |
14 | 14 | 15 | 0 | 3 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0 | 0 | 0 | 350406 | 7.8542 | NaN | S |
15 | 15 | 16 | 1 | 2 | Hewlett, Mrs. (Mary D Kingcome) | female | 55.0 | 0 | 0 | 248706 | 16.0000 | NaN | S |
18 | 18 | 19 | 0 | 3 | Vander Planke, Mrs. Julius (Emelia Maria Vande... | female | 31.0 | 1 | 0 | 345763 | 18.0000 | NaN | S |
19 | 19 | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | NaN | 0 | 0 | 2649 | 7.2250 | NaN | C |
22 | 22 | 23 | 1 | 3 | McGowan, Miss. Anna "Annie" | female | 15.0 | 0 | 0 | 330923 | 8.0292 | NaN | Q |
24 | 24 | 25 | 0 | 3 | Palsson, Miss. Torborg Danira | female | 8.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
25 | 25 | 26 | 1 | 3 | Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... | female | 38.0 | 1 | 5 | 347077 | 31.3875 | NaN | S |
28 | 28 | 29 | 1 | 3 | O'Dwyer, Miss. Ellen "Nellie" | female | NaN | 0 | 0 | 330959 | 7.8792 | NaN | Q |
31 | 31 | 32 | 1 | 1 | Spencer, Mrs. William Augustus (Marie Eugenie) | female | NaN | 1 | 0 | PC 17569 | 146.5208 | B78 | C |
32 | 32 | 33 | 1 | 3 | Glynn, Miss. Mary Agatha | female | NaN | 0 | 0 | 335677 | 7.7500 | NaN | Q |
38 | 38 | 39 | 0 | 3 | Vander Planke, Miss. Augusta Maria | female | 18.0 | 2 | 0 | 345764 | 18.0000 | NaN | S |
39 | 39 | 40 | 1 | 3 | Nicola-Yarred, Miss. Jamila | female | 14.0 | 1 | 0 | 2651 | 11.2417 | NaN | C |
40 | 40 | 41 | 0 | 3 | Ahlin, Mrs. Johan (Johanna Persdotter Larsson) | female | 40.0 | 1 | 0 | 7546 | 9.4750 | NaN | S |
41 | 41 | 42 | 0 | 2 | Turpin, Mrs. William John Robert (Dorothy Ann ... | female | 27.0 | 1 | 0 | 11668 | 21.0000 | NaN | S |
43 | 43 | 44 | 1 | 2 | Laroche, Miss. Simonne Marie Anne Andree | female | 3.0 | 1 | 2 | SC/Paris 2123 | 41.5792 | NaN | C |
44 | 44 | 45 | 1 | 3 | Devaney, Miss. Margaret Delia | female | 19.0 | 0 | 0 | 330958 | 7.8792 | NaN | Q |
47 | 47 | 48 | 1 | 3 | O'Driscoll, Miss. Bridget | female | NaN | 0 | 0 | 14311 | 7.7500 | NaN | Q |
49 | 49 | 50 | 0 | 3 | Arnold-Franchi, Mrs. Josef (Josefine Franchi) | female | 18.0 | 1 | 0 | 349237 | 17.8000 | NaN | S |
52 | 52 | 53 | 1 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | female | 49.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C |
53 | 53 | 54 | 1 | 2 | Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkin... | female | 29.0 | 1 | 0 | 2926 | 26.0000 | NaN | S |
56 | 56 | 57 | 1 | 2 | Rugg, Miss. Emily | female | 21.0 | 0 | 0 | C.A. 31026 | 10.5000 | NaN | S |
58 | 58 | 59 | 1 | 2 | West, Miss. Constance Mirium | female | 5.0 | 1 | 2 | C.A. 34651 | 27.7500 | NaN | S |
61 | 61 | 62 | 1 | 1 | Icard, Miss. Amelie | female | 38.0 | 0 | 0 | 113572 | 80.0000 | B28 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
807 | 807 | 808 | 0 | 3 | Pettersson, Miss. Ellen Natalia | female | 18.0 | 0 | 0 | 347087 | 7.7750 | NaN | S |
809 | 809 | 810 | 1 | 1 | Chambers, Mrs. Norman Campbell (Bertha Griggs) | female | 33.0 | 1 | 0 | 113806 | 53.1000 | E8 | S |
813 | 813 | 814 | 0 | 3 | Andersson, Miss. Ebba Iris Alfrida | female | 6.0 | 4 | 2 | 347082 | 31.2750 | NaN | S |
816 | 816 | 817 | 0 | 3 | Heininen, Miss. Wendla Maria | female | 23.0 | 0 | 0 | STON/O2. 3101290 | 7.9250 | NaN | S |
820 | 820 | 821 | 1 | 1 | Hays, Mrs. Charles Melville (Clara Jennings Gr... | female | 52.0 | 1 | 1 | 12749 | 93.5000 | B69 | S |
823 | 823 | 824 | 1 | 3 | Moor, Mrs. (Beila) | female | 27.0 | 0 | 1 | 392096 | 12.4750 | E121 | S |
829 | 829 | 830 | 1 | 1 | Stone, Mrs. George Nelson (Martha Evelyn) | female | 62.0 | 0 | 0 | 113572 | 80.0000 | B28 | NaN |
830 | 830 | 831 | 1 | 3 | Yasbeck, Mrs. Antoni (Selini Alexander) | female | 15.0 | 1 | 0 | 2659 | 14.4542 | NaN | C |
835 | 835 | 836 | 1 | 1 | Compton, Miss. Sara Rebecca | female | 39.0 | 1 | 1 | PC 17756 | 83.1583 | E49 | C |
842 | 842 | 843 | 1 | 1 | Serepeca, Miss. Augusta | female | 30.0 | 0 | 0 | 113798 | 31.0000 | NaN | C |
849 | 849 | 850 | 1 | 1 | Goldenberg, Mrs. Samuel L (Edwiga Grabowska) | female | NaN | 1 | 0 | 17453 | 89.1042 | C92 | C |
852 | 852 | 853 | 0 | 3 | Boulos, Miss. Nourelain | female | 9.0 | 1 | 1 | 2678 | 15.2458 | NaN | C |
853 | 853 | 854 | 1 | 1 | Lines, Miss. Mary Conover | female | 16.0 | 0 | 1 | PC 17592 | 39.4000 | D28 | S |
854 | 854 | 855 | 0 | 2 | Carter, Mrs. Ernest Courtenay (Lilian Hughes) | female | 44.0 | 1 | 0 | 244252 | 26.0000 | NaN | S |
855 | 855 | 856 | 1 | 3 | Aks, Mrs. Sam (Leah Rosen) | female | 18.0 | 0 | 1 | 392091 | 9.3500 | NaN | S |
856 | 856 | 857 | 1 | 1 | Wick, Mrs. George Dennick (Mary Hitchcock) | female | 45.0 | 1 | 1 | 36928 | 164.8667 | NaN | S |
858 | 858 | 859 | 1 | 3 | Baclini, Mrs. Solomon (Latifa Qurban) | female | 24.0 | 0 | 3 | 2666 | 19.2583 | NaN | C |
862 | 862 | 863 | 1 | 1 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | female | 48.0 | 0 | 0 | 17466 | 25.9292 | D17 | S |
863 | 863 | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | NaN | 8 | 2 | CA. 2343 | 69.5500 | NaN | S |
865 | 865 | 866 | 1 | 2 | Bystrom, Mrs. (Karolina) | female | 42.0 | 0 | 0 | 236852 | 13.0000 | NaN | S |
866 | 866 | 867 | 1 | 2 | Duran y More, Miss. Asuncion | female | 27.0 | 1 | 0 | SC/PARIS 2149 | 13.8583 | NaN | C |
871 | 871 | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.0 | 1 | 1 | 11751 | 52.5542 | D35 | S |
874 | 874 | 875 | 1 | 2 | Abelson, Mrs. Samuel (Hannah Wizosky) | female | 28.0 | 1 | 0 | P/PP 3381 | 24.0000 | NaN | C |
875 | 875 | 876 | 1 | 3 | Najib, Miss. Adele Kiamie "Jane" | female | 15.0 | 0 | 0 | 2667 | 7.2250 | NaN | C |
879 | 879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
880 | 880 | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.0 | 0 | 1 | 230433 | 26.0000 | NaN | S |
882 | 882 | 883 | 0 | 3 | Dahlberg, Miss. Gerda Ulrika | female | 22.0 | 0 | 0 | 7552 | 10.5167 | NaN | S |
885 | 885 | 886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NaN | Q |
887 | 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
314 rows × 13 columns
# Males that survived vs males that passed away
train["Survived"][train["Sex"] == 'male']
0 0 4 0 5 0 6 0 7 0 12 0 13 0 16 0 17 1 20 0 21 1 23 1 26 0 27 0 29 0 30 0 33 0 34 0 35 0 36 1 37 0 42 0 45 0 46 0 48 0 50 0 51 0 54 0 55 1 57 0 .. 840 0 841 0 843 0 844 0 845 0 846 0 847 0 848 0 850 0 851 0 857 1 859 0 860 0 861 0 864 0 867 0 868 0 869 1 870 0 872 0 873 0 876 0 877 0 878 0 881 0 883 0 884 0 886 0 889 1 890 0 Name: Survived, dtype: int64
# Males that survived vs males that passed away
train["Survived"][train["Sex"] == 'male'].value_counts()
0 468 1 109 Name: Survived, dtype: int64
# Females that survived vs Females that passed away
train["Survived"][train["Sex"] == 'female'].value_counts()
1 233 0 81 Name: Survived, dtype: int64
# Normalized male survival
train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)
0 0.811092 1 0.188908 Name: Survived, dtype: float64
# Normalized female survival
train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)
1 0.742038 0 0.257962 Name: Survived, dtype: float64
# Create the column Child, and indicate whether child or not a child. Print the new column.
train["Child"] = float('NaN')
train.Child[train.Age < 5] = 1
train.Child[train.Age >= 5] = 0
print(train.Child[:3])
0 0.0 1 0.0 2 0.0 Name: Child, dtype: float64
/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /Users/datalab/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Normalized Survival Rates for under 18
train.Survived[train.Child == 1].value_counts(normalize = True)
1 0.675 0 0.325 Name: Survived, dtype: float64
# Normalized Survival Rates for over 18
train.Survived[train.Child == 0].value_counts(normalize = True)
0 0.618968 1 0.381032 Name: Survived, dtype: float64
# Create a copy of test: test_one
test_one = test
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one`
test_one.Survived[test_one.Sex =='female'] = 1
print(test_one.Survived[:3])
0 0 1 1 2 0 Name: Survived, dtype: int64
/Users/datalab/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from ipykernel import kernelapp as app /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df = pd.read_csv('../data/tianya_bbs_threads_list.txt',\
sep = "\t", names = ['title','link', \
'author','author_page',\
'click','reply','time'])
df[:2]
title | link | author | author_page | click | reply | time | |
---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 |
da = pd.read_csv('../data/tianya_bbs_threads_author_info.txt',
sep = "\t", names = ['author_page','followed_num',\
'fans_num','post_num', \
'comment_num'])
da[:2]
author_page | followed_num | fans_num | post_num | comment_num | |
---|---|---|---|---|---|
0 | http://www.tianya.cn/50499450 | 152 | 27452 | 1020 | 1341 |
1 | http://www.tianya.cn/74341835 | 0 | 1 | 2 | 5 |
data = pd.concat([df,da], axis=1)
len(data)
467
data[:3]
title | link | author | author_page | click | reply | time | author_page | followed_num | fans_num | post_num | comment_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 【民间语文第161期】宁波px启示:船进港湾人应上岸 | /post-free-2849477-1.shtml | 贾也 | http://www.tianya.cn/50499450 | 194675 | 2703 | 2012-10-29 07:59 | http://www.tianya.cn/50499450 | 152 | 27452 | 1020 | 1341 |
1 | 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) | /post-free-2839539-1.shtml | 无上卫士ABC | http://www.tianya.cn/74341835 | 88244 | 1041 | 2012-10-24 12:41 | http://www.tianya.cn/74341835 | 0 | 1 | 2 | 5 |
2 | 宁波准备停止PX项目了,元芳,你怎么看? | /post-free-2848797-1.shtml | 牧阳光 | http://www.tianya.cn/36535656 | 82779 | 625 | 2012-10-28 19:11 | http://www.tianya.cn/36535656 | 19 | 28 | 816 | 1268 |
type(data.time[0])
str
# extract date from datetime
# date = map(lambda x: x[:10], data.time)
date = [i[:10] for i in data.time]
#date = [i[:10] for i in data.time]
data['date'] = pd.to_datetime(date)
# convert str to datetime format
data.time = pd.to_datetime(data.time)
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
type(data.time[0])
pandas.tslib.Timestamp
data.describe()
#data.head()
click | reply | month | year | day | |
---|---|---|---|---|---|
count | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 |
mean | 1534.957173 | 18.907923 | 7.432548 | 2012.620985 | 17.961456 |
std | 11099.249834 | 144.869921 | 3.084860 | 1.795269 | 9.491730 |
min | 11.000000 | 0.000000 | 1.000000 | 2006.000000 | 1.000000 |
25% | 42.500000 | 0.000000 | 5.000000 | 2013.000000 | 8.000000 |
50% | 84.000000 | 0.000000 | 6.000000 | 2013.000000 | 23.000000 |
75% | 322.000000 | 4.000000 | 11.000000 | 2013.000000 | 25.000000 |
max | 194675.000000 | 2703.000000 | 12.000000 | 2015.000000 | 31.000000 |
http://statsmodels.sourceforge.net/
Statsmodels is a Python module that allows users to explore data, estimate statistical models, and perform statistical tests.
An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator.
Researchers across fields may find that statsmodels fully meets their needs for statistical computing and data analysis in Python.
import statsmodels.api as sm
' '.join(dir(sm))
'GEE GLM GLS GLSAR Logit MNLogit MixedLM NegativeBinomial NominalGEE OLS OrdinalGEE PHReg Poisson ProbPlot Probit QuantReg RLM WLS __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __spec__ add_constant categorical cov_struct datasets distributions emplike families formula genmod graphics iolib load nonparametric qqline qqplot qqplot_2samples regression robust show_versions stats test tools tsa version webdoc'
' '.join(dir(sm.stats))
'CompareCox CompareJ CompareMeans DescrStatsW Describe FTestAnovaPower FTestPower GofChisquarePower HetGoldfeldQuandt NormalIndPower Runs TTestIndPower TTestPower __builtins__ __cached__ __doc__ __file__ __loader__ __name__ __package__ __spec__ acorr_breush_godfrey acorr_ljungbox anova_lm binom_test binom_test_reject_interval binom_tost binom_tost_reject_interval breaks_cusumolsresid breaks_hansen chisquare_effectsize cochrans_q compare_cox compare_j corr_clipped corr_nearest cov_cluster cov_cluster_2groups cov_hac cov_hc0 cov_hc1 cov_hc2 cov_hc3 cov_nearest cov_nw_panel cov_white_simple diagnostic durbin_watson fdrcorrection fdrcorrection_twostage gof gof_chisquare_discrete het_arch het_breushpagan het_goldfeldquandt het_white jarque_bera lillifors linear_harvey_collier linear_lm linear_rainbow mcnemar moment_helpers multicomp multipletests normal_ad omni_normtest power_binom_tost power_ztost_prop powerdiscrepancy proportion_confint proportion_effectsize proportions_chisquare proportions_chisquare_allpairs proportions_chisquare_pairscontrol proportions_ztest proportions_ztost recursive_olsresiduals runstest_1samp runstest_2samp sandwich_covariance se_cov stattools symmetry_bowker tt_ind_solve_power tt_solve_power ttest_ind ttost_ind ttost_paired tukeyhsd unitroot_adf zconfint zt_ind_solve_power ztest ztost'
data.describe()
click | reply | month | year | day | |
---|---|---|---|---|---|
count | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 |
mean | 1534.957173 | 18.907923 | 7.432548 | 2012.620985 | 17.961456 |
std | 11099.249834 | 144.869921 | 3.084860 | 1.795269 | 9.491730 |
min | 11.000000 | 0.000000 | 1.000000 | 2006.000000 | 1.000000 |
25% | 42.500000 | 0.000000 | 5.000000 | 2013.000000 | 8.000000 |
50% | 84.000000 | 0.000000 | 6.000000 | 2013.000000 | 23.000000 |
75% | 322.000000 | 4.000000 | 11.000000 | 2013.000000 | 25.000000 |
max | 194675.000000 | 2703.000000 | 12.000000 | 2015.000000 | 31.000000 |
import numpy as np
np.mean(data.click), np.std(data.click), np.sum(data.click)
(1534.9571734475376, 11087.35990002894, 716825)
# 不加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, \
weights=[1 for i in data.click])
d1.mean, d1.var, d1.std, d1.sum
(1534.9571734475376, 122929549.55276974, 11087.35990002894, 716825.0)
# 加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum
(83335.963986409959, 6297145701.6868114, 79354.556905617035, 735856562.0)
np.median(data.click) # np.percentile
84.0
plt.hist(data.click)
plt.show()
plt.hist(data.reply, color = 'green')
plt.show()
plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='red')
plt.show()
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1)])
plt.show()
# Plot the height and weight to see
plt.boxplot([data.click, data.reply])
plt.show()
def transformData(dat):
results = []
for i in dat:
if i != 'na':
results.append( int(i))
else:
results.append(0)
return results
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
data.describe()
click | reply | followed_num | fans_num | post_num | comment_num | month | year | day | |
---|---|---|---|---|---|---|---|---|---|
count | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 | 467.000000 |
mean | 1534.957173 | 18.907923 | 15.713062 | 839.421842 | 146.336188 | 434.556745 | 7.432548 | 2012.620985 | 17.961456 |
std | 11099.249834 | 144.869921 | 120.221465 | 7589.853870 | 577.441999 | 1989.458332 | 3.084860 | 1.795269 | 9.491730 |
min | 11.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 1.000000 |
25% | 42.500000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 0.000000 | 5.000000 | 2013.000000 | 8.000000 |
50% | 84.000000 | 0.000000 | 0.000000 | 1.000000 | 16.000000 | 9.000000 | 6.000000 | 2013.000000 | 23.000000 |
75% | 322.000000 | 4.000000 | 1.000000 | 4.000000 | 84.000000 | 88.000000 | 11.000000 | 2013.000000 | 25.000000 |
max | 194675.000000 | 2703.000000 | 1817.000000 | 108449.000000 | 10684.000000 | 24848.000000 | 12.000000 | 2015.000000 | 31.000000 |
import numpy as np
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1), np.log(data.reply+1),
np.log(data.fans_num+1),\
np.log(data.followed_num + 1)],
labels = ['$Click$', '$Reply$', '$Fans$',\
'$Followed$'])
plt.show()
fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()
' '.join(dir(data)[200:])
'_typ _unpickle_frame_compat _unpickle_matrix_compat _update_inplace _validate_dtype _values _xs abs add add_prefix add_suffix align all any append apply applymap as_blocks as_matrix asfreq assign astype at at_time author author_page axes between_time bfill blocks bool boxplot click clip clip_lower clip_upper columns combine combineAdd combineMult combine_first comment_num compound consolidate convert_objects copy corr corrwith count cov cummax cummin cumprod cumsum date day describe diff div divide dot drop drop_duplicates dropna dtypes duplicated empty eq equals eval ewm expanding fans_num ffill fillna filter first first_valid_index floordiv followed_num from_csv from_dict from_items from_records ftypes ge get get_dtype_counts get_ftype_counts get_value get_values groupby gt head hist iat icol idxmax idxmin iget_value iloc index info insert interpolate irow is_copy isin isnull iteritems iterkv iterrows itertuples ix join keys kurt kurtosis last last_valid_index le link loc lookup lt mad mask max mean median memory_usage merge min mod mode month mul multiply ndim ne nlargest notnull nsmallest pct_change pipe pivot pivot_table plot pop post_num pow prod product quantile query radd rank rdiv reindex reindex_axis reindex_like rename rename_axis reorder_levels replace reply resample reset_index rfloordiv rmod rmul rolling round rpow rsub rtruediv sample select select_dtypes sem set_axis set_index set_value shape shift size skew slice_shift sort sort_index sort_values sortlevel squeeze stack std style sub subtract sum swapaxes swaplevel tail take time title to_clipboard to_csv to_dense to_dict to_excel to_gbq to_hdf to_html to_json to_latex to_msgpack to_panel to_period to_pickle to_records to_sparse to_sql to_stata to_string to_timestamp to_wide to_xarray transpose truediv truncate tshift tz_convert tz_localize unstack update values var where xs year'
from pandas.tools import plotting
# fig = plt.figure(figsize=(10, 10))
plotting.scatter_matrix(data[['click', 'reply',\
'post_num','comment_num']])
plt.show()
' '.join(dir(plotting))
'AbstractMethodError Appender AreaPlot BarPlot BarhPlot BasePlotMethods BoxPlot FramePlotMethods HexBinPlot HistPlot Index KdePlot LinePlot LooseVersion MPLPlot MultiIndex PandasObject PeriodIndex PiePlot PlanePlot ScatterPlot Series SeriesPlotMethods _Options __builtins__ __doc__ __file__ __name__ __package__ _all_kinds _common_kinds _dataframe_kinds _flatten _gca _gcf _get_all_lines _get_layout _get_marker_compat _get_standard_colors _get_standard_kind _get_xlim _grouped_plot _grouped_plot_by_column _handle_shared_axes _klasses _mpl_ge_1_3_1 _mpl_ge_1_4_0 _mpl_ge_1_5_0 _mpl_le_1_2_1 _plot _plot_klass _remove_labels_from_axis _series_kinds _set_ticks_props _shared_doc_df_kwargs _shared_doc_kwargs _shared_doc_series_kwargs _shared_docs _subplots andrews_curves autocorrelation_plot bootstrap_plot boxplot boxplot_frame_groupby cache_readonly ceil colors com compat contextmanager conv cycler deprecate_kwarg df_ax df_coord df_kind df_note df_unique format_date_labels grouped_hist hist_frame hist_series klass lag_plot lmap lrange map mpl_stylesheet namedtuple np parallel_coordinates plot_frame plot_params plot_series radviz range re remove_na scatter_matrix scatter_plot series_ax series_coord series_kind series_note series_unique string_types table warnings zip'
http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
import seaborn # conda install seaborn
seaborn.pairplot(data, vars=['click', 'reply', \
'post_num', 'comment_num'],
kind='reg')
<seaborn.axisgrid.PairGrid at 0x119406898>
seaborn.pairplot(data, vars=['click', 'reply', 'post_num'],
kind='reg', hue='year')
<seaborn.axisgrid.PairGrid at 0x119640f60>
seaborn.lmplot(y='reply', x='click', data=data,
size = 5)
plt.show()
data.year.value_counts()
2013 304 2014 63 2007 34 2012 33 2015 20 2011 6 2009 6 2006 1 Name: year, dtype: int64
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
dd
year | |
---|---|
2006 | 1 |
2007 | 34 |
2009 | 6 |
2011 | 6 |
2012 | 33 |
2013 | 304 |
2014 | 63 |
2015 | 20 |
dd.index
Int64Index([2006, 2007, 2009, 2011, 2012, 2013, 2014, 2015], dtype='int64')
dd_date_str = list(map(lambda x: str(x) +'-01-01', dd.index))
dd_date_str
['2006-01-01', '2007-01-01', '2009-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01']
dd_date = pd.to_datetime(list(dd_date_str))
dd_date
DatetimeIndex(['2006-01-01', '2007-01-01', '2009-01-01', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01'], dtype='datetime64[ns]', freq=None)
plt.plot(dd_date, dd.year, 'r-o')
plt.show()
ds = dd.cumsum()
ds
year | |
---|---|
2006 | 1 |
2007 | 35 |
2009 | 41 |
2011 | 47 |
2012 | 80 |
2013 | 384 |
2014 | 447 |
2015 | 467 |
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()
def getDate(dat):
dat_date_str = list(map(lambda x: str(x) +'-01-01', dat.index))
dat_date = pd.to_datetime(dat_date_str)
return dat_date
ds.date = getDate(ds)
dd.date = getDate(dd)
plt.plot(ds.date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd.date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
dg = data.groupby('year').sum()
dg
click | reply | followed_num | fans_num | post_num | comment_num | month | day | |
---|---|---|---|---|---|---|---|---|
year | ||||||||
2006 | 1214 | 24 | 0 | 2 | 278 | 291 | 8 | 24 |
2007 | 28290 | 514 | 22 | 137 | 8041 | 10344 | 281 | 512 |
2009 | 18644 | 186 | 17 | 12 | 531 | 571 | 39 | 78 |
2011 | 2889 | 28 | 84 | 28 | 332 | 661 | 50 | 72 |
2012 | 463720 | 5933 | 2779 | 59511 | 12315 | 32498 | 322 | 819 |
2013 | 63140 | 937 | 571 | 43265 | 24359 | 40362 | 2458 | 6111 |
2014 | 57764 | 772 | 2216 | 16664 | 11266 | 98025 | 233 | 579 |
2015 | 81164 | 436 | 1649 | 272391 | 11217 | 20186 | 80 | 193 |
dgs = dg.cumsum()
dgs
click | reply | followed_num | fans_num | post_num | comment_num | month | day | |
---|---|---|---|---|---|---|---|---|
year | ||||||||
2006 | 1214 | 24 | 0 | 2 | 278 | 291 | 8 | 24 |
2007 | 29504 | 538 | 22 | 139 | 8319 | 10635 | 289 | 536 |
2009 | 48148 | 724 | 39 | 151 | 8850 | 11206 | 328 | 614 |
2011 | 51037 | 752 | 123 | 179 | 9182 | 11867 | 378 | 686 |
2012 | 514757 | 6685 | 2902 | 59690 | 21497 | 44365 | 700 | 1505 |
2013 | 577897 | 7622 | 3473 | 102955 | 45856 | 84727 | 3158 | 7616 |
2014 | 635661 | 8394 | 5689 | 119619 | 57122 | 182752 | 3391 | 8195 |
2015 | 716825 | 8830 | 7338 | 392010 | 68339 | 202938 | 3471 | 8388 |
def getDate(dat):
dat_date_str = list(map(lambda x: str(x) +'-01-01', dat.index))
dat_date = pd.to_datetime(dat_date_str)
return dat_date
dg.date = getDate(dg)
fig = plt.figure(figsize=(12,5))
plt.plot(dg.date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')
plt.yscale('log')
plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()
data.groupby('year')['click'].sum()
year 2006 1214 2007 28290 2009 18644 2011 2889 2012 463720 2013 63140 2014 57764 2015 81164 Name: click, dtype: int64
data.groupby('year')['click'].mean()
year 2006 1214.000000 2007 832.058824 2009 3107.333333 2011 481.500000 2012 14052.121212 2013 207.697368 2014 916.888889 2015 4058.200000 Name: click, dtype: float64
repost = []
for i in df.title:
if u'转载' in i:
repost.append(1)
else:
repost.append(0)
df['repost'] = repost
df.groupby('repost').mean()
click | reply | |
---|---|---|
repost | ||
0 | 2884.381720 | 33.376344 |
1 | 641.743772 | 9.330961 |
df['click'][df['repost']==0][:5]
0 194675 2 82779 3 45304 5 27026 6 24026 Name: click, dtype: int64
df['click'][df['repost']==1][:5]
1 88244 4 38132 13 4990 16 3720 18 3421 Name: click, dtype: int64
from scipy import stats
stats.ttest_ind(data.click, data.repost)
Ttest_indResult(statistic=2.9514887561591618, pvalue=0.0032417014839700789)
sm.stats.ttest_ind(data.click, data.reply)
# test statistic, pvalue and degrees of freedom
(2.9514887561591618, 0.0032417014839700789, 932.0)
https://en.wikipedia.org/wiki/Chi-squared_test
reject the null hypothesis that the data are independent
.assumption of independent normally distributed data
, which is valid in many cases due to the central limit theorem
. sum of squared errors
, or through the sample variance
.Suppose there is a city of 1 million residents with four neighborhoods
: A, B, C, and D.
A random sample of 650 residents of the city is taken and their occupation is recorded as "blue collar", "white collar", or "no collar".
The null hypothesis
is that each person's neighborhood of residence is independent of the person's occupational classification. The data are tabulated as:
A | B | C | D | Total | |
---|---|---|---|---|---|
White collar | 90 | 60 | 104 | 95 | 349 |
Blue collar | 30 | 50 | 51 | 20 | 151 |
No coloar | 30 | 40 | 45 | 35 | 150 |
Total | 150 | 150 | 200 | 150 | 650 |
white-collar workers in neighborhood A
to be$ \frac{150}{650} \frac{349}{650} 650 = 80.54 $
Then in that "cell" of the table, we have
$\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
The sum of these quantities over all of the cells is the test statistic
.
Under the null hypothesis, it has approximately a chi-square distribution whose number of degrees of freedom
are
$ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $
If the test statistic is improbably large according to that chi-square distribution, then one rejects the null hypothesis of independence.
Parameters:
from scipy.stats import chisquare
chisquare([16, 18, 16, 14, 12, 12], \
f_exp=[16, 16, 16, 16, 16, 8])
Power_divergenceResult(statistic=3.5, pvalue=0.62338762774958223)
from scipy.stats import chi2
# p_value = chi2.sf(chi_statistic, df)
chi2.sf(3.5, 5), 1 - chi2.cdf(3.5,5)
(0.62338762774958223, 0.62338762774958223)
print np.corrcoef(data.click, data.reply)
print np.corrcoef(np.log(data.click+1), \
np.log(data.reply+1))
[[ 1. 0.96396571] [ 0.96396571 1. ]] [[ 1. 0.77721397] [ 0.77721397 1. ]]
data.corr()
click | reply | followed_num | fans_num | post_num | comment_num | month | year | day | |
---|---|---|---|---|---|---|---|---|---|
click | 1.000000 | 0.963966 | 0.143595 | 0.158116 | 0.097502 | 0.085615 | 0.038788 | -0.024827 | 0.048361 |
reply | 0.963966 | 1.000000 | 0.199270 | 0.159387 | 0.090342 | 0.123341 | 0.040165 | -0.041208 | 0.058738 |
followed_num | 0.143595 | 0.199270 | 1.000000 | 0.407656 | 0.211677 | 0.499612 | -0.036037 | 0.051187 | -0.020604 |
fans_num | 0.158116 | 0.159387 | 0.407656 | 1.000000 | 0.341724 | 0.145387 | -0.084243 | 0.102301 | -0.045883 |
post_num | 0.097502 | 0.090342 | 0.211677 | 0.341724 | 1.000000 | 0.514695 | -0.070024 | -0.011786 | -0.033254 |
comment_num | 0.085615 | 0.123341 | 0.499612 | 0.145387 | 0.514695 | 1.000000 | -0.118703 | 0.069160 | -0.119840 |
month | 0.038788 | 0.040165 | -0.036037 | -0.084243 | -0.070024 | -0.118703 | 1.000000 | -0.236920 | 0.535354 |
year | -0.024827 | -0.041208 | 0.051187 | 0.102301 | -0.011786 | 0.069160 | -0.236920 | 1.000000 | -0.046699 |
day | 0.048361 | 0.058738 | -0.020604 | -0.045883 | -0.033254 | -0.119840 | 0.535354 | -0.046699 | 1.000000 |
plt.plot(df.click, df.reply, 'r-o')
plt.show()
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
plt.title('$Allowmetric\,Law$', fontsize = 20)
plt.show()
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', \
data=dat).fit()
有些使用windows的同学无法运行上述代码:
https://groups.google.com/forum/#!topic/pystatsmodels/KcSzNqDxv-Q
# Inspect the results
print results.summary()
OLS Regression Results ============================================================================== Dep. Variable: Lottery R-squared: 0.348 Model: OLS Adj. R-squared: 0.333 Method: Least Squares F-statistic: 22.20 Date: Sat, 16 Apr 2016 Prob (F-statistic): 1.90e-08 Time: 23:39:42 Log-Likelihood: -379.82 No. Observations: 86 AIC: 765.6 Df Residuals: 83 BIC: 773.0 Df Model: 2 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [95.0% Conf. Int.] ----------------------------------------------------------------------------------- Intercept 246.4341 35.233 6.995 0.000 176.358 316.510 Literacy -0.4889 0.128 -3.832 0.000 -0.743 -0.235 np.log(Pop1831) -31.3114 5.977 -5.239 0.000 -43.199 -19.424 ============================================================================== Omnibus: 3.713 Durbin-Watson: 2.019 Prob(Omnibus): 0.156 Jarque-Bera (JB): 3.394 Skew: -0.487 Prob(JB): 0.183 Kurtosis: 3.003 Cond. No. 702. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
reg = smf.ols('reply ~ click + followed_num', \
data=data).fit()
print reg.summary()
OLS Regression Results ============================================================================== Dep. Variable: reply R-squared: 0.933 Model: OLS Adj. R-squared: 0.933 Method: Least Squares F-statistic: 3231. Date: Sun, 17 Apr 2016 Prob (F-statistic): 4.30e-273 Time: 02:04:27 Log-Likelihood: -2354.7 No. Observations: 467 AIC: 4715. Df Residuals: 464 BIC: 4728. Df Model: 2 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [95.0% Conf. Int.] -------------------------------------------------------------------------------- Intercept -1.4024 1.766 -0.794 0.428 -4.873 2.068 click 0.0125 0.000 78.660 0.000 0.012 0.013 followed_num 0.0749 0.015 5.117 0.000 0.046 0.104 ============================================================================== Omnibus: 374.515 Durbin-Watson: 1.938 Prob(Omnibus): 0.000 Jarque-Bera (JB): 97373.297 Skew: -2.416 Prob(JB): 0.00 Kurtosis: 73.575 Cond. No. 1.14e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.14e+04. This might indicate that there are strong multicollinearity or other numerical problems.
reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) \
+np.log(followed_num+1)+month', data=data).fit()
print reg1.summary()
OLS Regression Results ============================================================================== Dep. Variable: np.log(reply + 1) R-squared: 0.606 Model: OLS Adj. R-squared: 0.603 Method: Least Squares F-statistic: 236.9 Date: Sun, 14 May 2017 Prob (F-statistic): 4.03e-93 Time: 11:06:31 Log-Likelihood: -596.73 No. Observations: 467 AIC: 1201. Df Residuals: 463 BIC: 1218. Df Model: 3 Covariance Type: nonrobust ============================================================================================ coef std err t P>|t| [95.0% Conf. Int.] -------------------------------------------------------------------------------------------- Intercept -2.6009 0.189 -13.778 0.000 -2.972 -2.230 np.log(click + 1) 0.6872 0.029 24.083 0.000 0.631 0.743 np.log(followed_num + 1) 0.0118 0.034 0.347 0.729 -0.055 0.079 month 0.0172 0.013 1.275 0.203 -0.009 0.044 ============================================================================== Omnibus: 26.408 Durbin-Watson: 1.904 Prob(Omnibus): 0.000 Jarque-Bera (JB): 44.572 Skew: -0.389 Prob(JB): 2.10e-10 Kurtosis: 4.299 Cond. No. 44.1 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(reg1, fig = fig)
plt.show()
import statsmodels.api as sm
from statsmodels.formula.api import ols
moore = sm.datasets.get_rdataset("Moore", "car",
cache=True) # load data
data = moore.data
data = data.rename(columns={"partner.status" :
"partner_status"}) # make name pythonic
data[:5]
partner_status | conformity | fcategory | fscore | |
---|---|---|---|---|
0 | low | 8 | low | 37 |
1 | low | 4 | high | 57 |
2 | low | 8 | high | 65 |
3 | low | 7 | low | 20 |
4 | low | 10 | low | 36 |
moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
data=data).fit()
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(moore_lm, fig = fig)
plt.show()
table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
print table
sum_sq df F \ C(fcategory, Sum) 11.614700 2.0 0.276958 C(partner_status, Sum) 212.213778 1.0 10.120692 C(fcategory, Sum):C(partner_status, Sum) 175.488928 2.0 4.184623 Residual 817.763961 39.0 NaN PR(>F) C(fcategory, Sum) 0.759564 C(partner_status, Sum) 0.002874 C(fcategory, Sum):C(partner_status, Sum) 0.022572 Residual NaN