#from the:
http://www.listendata.com/2016/08/dplyr-tutorial.html
# dplyr package was written by the most popular R programmer Hadley Wickham who has written many
# useful R packages such as ggplot2, tidyr etc.
# Two azaming hire for MST would be
1. Hadley Wickham -> R Fame
2. Wes McKinney -> python panda Fame
install.packages("dplyr")
Installing package into 'C:/Users/v-thbeta/Documents/R/win-library/3.3' (as 'lib' is unspecified) also installing the dependencies 'assertthat', 'tibble', 'lazyeval', 'DBI', 'BH'
package 'assertthat' successfully unpacked and MD5 sums checked package 'tibble' successfully unpacked and MD5 sums checked package 'lazyeval' successfully unpacked and MD5 sums checked package 'DBI' successfully unpacked and MD5 sums checked package 'BH' successfully unpacked and MD5 sums checked package 'dplyr' successfully unpacked and MD5 sums checked The downloaded binary packages are in C:\Users\v-thbeta\AppData\Local\Temp\RtmpiObDvN\downloaded_packages
library(dplyr)
Attaching package: 'dplyr' The following objects are masked from 'package:stats': filter, lag The following objects are masked from 'package:base': intersect, setdiff, setequal, union
df <- read.csv("c:\\Users\\v-thbeta\\Desktop\\examples\\data\\states_income\\sampledata.csv")
glimpse(df)
Observations: 51 Variables: 16 $ Index <fctr> A, A, A, A, C, C, C, D, D, F, G, H, I, I, I, I, K, K, L, M, ... $ State <fctr> Alabama, Alaska, Arizona, Arkansas, California, Colorado, Co... $ Y2002 <int> 1296530, 1170302, 1742027, 1485531, 1685349, 1343824, 1610512... $ Y2003 <int> 1317711, 1960378, 1968140, 1994927, 1675807, 1878473, 1232844... $ Y2004 <int> 1118631, 1818085, 1377583, 1119299, 1889570, 1886149, 1181949... $ Y2005 <int> 1492583, 1447852, 1782199, 1947979, 1480280, 1236697, 1518933... $ Y2006 <int> 1107408, 1861639, 1102568, 1669191, 1735069, 1871471, 1841266... $ Y2007 <int> 1440134, 1465841, 1109382, 1801213, 1812546, 1814218, 1976976... $ Y2008 <int> 1945229, 1551826, 1752886, 1188104, 1487315, 1875146, 1764457... $ Y2009 <int> 1944173, 1436541, 1554330, 1628980, 1663809, 1752387, 1972730... $ Y2010 <int> 1237582, 1629616, 1300521, 1669295, 1624509, 1913275, 1968730... $ Y2011 <int> 1440756, 1230866, 1130709, 1928238, 1639670, 1665877, 1945524... $ Y2012 <int> 1186741, 1512804, 1907284, 1216675, 1921845, 1491604, 1228529... $ Y2013 <int> 1852841, 1985302, 1363279, 1591896, 1156536, 1178355, 1582249... $ Y2014 <int> 1558906, 1580394, 1525866, 1360959, 1388461, 1383978, 1503156... $ Y2015 <int> 1916661, 1979143, 1647724, 1329341, 1644607, 1330736, 1718072...
dim(df)
sample_n(df,5)
Index | State | Y2002 | Y2003 | Y2004 | Y2005 | Y2006 | Y2007 | Y2008 | Y2009 | Y2010 | Y2011 | Y2012 | Y2013 | Y2014 | Y2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6 | C | Colorado | 1343824 | 1878473 | 1886149 | 1236697 | 1871471 | 1814218 | 1875146 | 1752387 | 1913275 | 1665877 | 1491604 | 1178355 | 1383978 | 1330736 |
32 | N | New Mexico | 1819239 | 1226057 | 1935991 | 1124400 | 1723493 | 1475985 | 1237704 | 1820856 | 1801430 | 1653384 | 1475715 | 1623388 | 1533494 | 1868612 |
36 | O | Ohio | 1802132 | 1648498 | 1441386 | 1670280 | 1534888 | 1314824 | 1516621 | 1511460 | 1585465 | 1887714 | 1227303 | 1840898 | 1880804 | 1573117 |
45 | U | Utah | 1771096 | 1195861 | 1979395 | 1241662 | 1437456 | 1859416 | 1939284 | 1915865 | 1619186 | 1288285 | 1108281 | 1123353 | 1801019 | 1729273 |
41 | S | South Carolina | 1631522 | 1803455 | 1425193 | 1458191 | 1538731 | 1825195 | 1250499 | 1864685 | 1345102 | 1116203 | 1532332 | 1591735 | 1188417 | 1110655 |
sample_n(df,5)
Index | State | Y2002 | Y2003 | Y2004 | Y2005 | Y2006 | Y2007 | Y2008 | Y2009 | Y2010 | Y2011 | Y2012 | Y2013 | Y2014 | Y2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
35 | N | North Dakota | 1618807 | 1510193 | 1876940 | 1443172 | 1425030 | 1868788 | 1720352 | 1671468 | 1534571 | 1271132 | 1430978 | 1529024 | 1563898 | 1604118 |
34 | N | North Carolina | 1616742 | 1292223 | 1482792 | 1532347 | 1158716 | 1827420 | 1267737 | 1116168 | 1791535 | 1553750 | 1472258 | 1104893 | 1596452 | 1229085 |
18 | K | Kentucky | 1813878 | 1448846 | 1800760 | 1250524 | 1137913 | 1911227 | 1301848 | 1956681 | 1350895 | 1512894 | 1916616 | 1878271 | 1722762 | 1913350 |
28 | N | Nebraska | 1885081 | 1309769 | 1425527 | 1240465 | 1500594 | 1278272 | 1140598 | 1270585 | 1128711 | 1187207 | 1569665 | 1690920 | 1459243 | 1802211 |
6 | C | Colorado | 1343824 | 1878473 | 1886149 | 1236697 | 1871471 | 1814218 | 1875146 | 1752387 | 1913275 | 1665877 | 1491604 | 1178355 | 1383978 | 1330736 |
dedup = distinct(df)
str(dedup)
'data.frame': 51 obs. of 16 variables: $ Index: Factor w/ 19 levels "A","C","D","F",..: 1 1 1 1 2 2 2 3 3 4 ... $ State: Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ... $ Y2002: int 1296530 1170302 1742027 1485531 1685349 1343824 1610512 1330403 1111437 1964626 ... $ Y2003: int 1317711 1960378 1968140 1994927 1675807 1878473 1232844 1268673 1993741 1468852 ... $ Y2004: int 1118631 1818085 1377583 1119299 1889570 1886149 1181949 1706751 1374643 1419738 ... $ Y2005: int 1492583 1447852 1782199 1947979 1480280 1236697 1518933 1403759 1827949 1362787 ... $ Y2006: int 1107408 1861639 1102568 1669191 1735069 1871471 1841266 1441351 1803852 1339608 ... $ Y2007: int 1440134 1465841 1109382 1801213 1812546 1814218 1976976 1300836 1595981 1278550 ... $ Y2008: int 1945229 1551826 1752886 1188104 1487315 1875146 1764457 1762096 1193245 1756185 ... $ Y2009: int 1944173 1436541 1554330 1628980 1663809 1752387 1972730 1553585 1739748 1818438 ... $ Y2010: int 1237582 1629616 1300521 1669295 1624509 1913275 1968730 1370984 1707823 1198403 ... $ Y2011: int 1440756 1230866 1130709 1928238 1639670 1665877 1945524 1318669 1353449 1497051 ... $ Y2012: int 1186741 1512804 1907284 1216675 1921845 1491604 1228529 1984027 1979708 1131928 ... $ Y2013: int 1852841 1985302 1363279 1591896 1156536 1178355 1582249 1671279 1912654 1107448 ... $ Y2014: int 1558906 1580394 1525866 1360959 1388461 1383978 1503156 1803169 1782169 1407784 ... $ Y2015: int 1916661 1979143 1647724 1329341 1644607 1330736 1718072 1627508 1410183 1170389 ...
dedup2 = distinct(df, Index, .keep_all=TRUE)
str(dedup)
'data.frame': 51 obs. of 16 variables: $ Index: Factor w/ 19 levels "A","C","D","F",..: 1 1 1 1 2 2 2 3 3 4 ... $ State: Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ... $ Y2002: int 1296530 1170302 1742027 1485531 1685349 1343824 1610512 1330403 1111437 1964626 ... $ Y2003: int 1317711 1960378 1968140 1994927 1675807 1878473 1232844 1268673 1993741 1468852 ... $ Y2004: int 1118631 1818085 1377583 1119299 1889570 1886149 1181949 1706751 1374643 1419738 ... $ Y2005: int 1492583 1447852 1782199 1947979 1480280 1236697 1518933 1403759 1827949 1362787 ... $ Y2006: int 1107408 1861639 1102568 1669191 1735069 1871471 1841266 1441351 1803852 1339608 ... $ Y2007: int 1440134 1465841 1109382 1801213 1812546 1814218 1976976 1300836 1595981 1278550 ... $ Y2008: int 1945229 1551826 1752886 1188104 1487315 1875146 1764457 1762096 1193245 1756185 ... $ Y2009: int 1944173 1436541 1554330 1628980 1663809 1752387 1972730 1553585 1739748 1818438 ... $ Y2010: int 1237582 1629616 1300521 1669295 1624509 1913275 1968730 1370984 1707823 1198403 ... $ Y2011: int 1440756 1230866 1130709 1928238 1639670 1665877 1945524 1318669 1353449 1497051 ... $ Y2012: int 1186741 1512804 1907284 1216675 1921845 1491604 1228529 1984027 1979708 1131928 ... $ Y2013: int 1852841 1985302 1363279 1591896 1156536 1178355 1582249 1671279 1912654 1107448 ... $ Y2014: int 1558906 1580394 1525866 1360959 1388461 1383978 1503156 1803169 1782169 1407784 ... $ Y2015: int 1916661 1979143 1647724 1329341 1644607 1330736 1718072 1627508 1410183 1170389 ...
df2 = select(df, Index, State:Y2008, Y2015)
# Expecting to see column Y2015 as part of the results-set
str(df2)
'data.frame': 51 obs. of 9 variables: $ Index: Factor w/ 19 levels "A","C","D","F",..: 1 1 1 1 2 2 2 3 3 4 ... $ State: Factor w/ 51 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ... $ Y2002: int 1296530 1170302 1742027 1485531 1685349 1343824 1610512 1330403 1111437 1964626 ... $ Y2003: int 1317711 1960378 1968140 1994927 1675807 1878473 1232844 1268673 1993741 1468852 ... $ Y2004: int 1118631 1818085 1377583 1119299 1889570 1886149 1181949 1706751 1374643 1419738 ... $ Y2005: int 1492583 1447852 1782199 1947979 1480280 1236697 1518933 1403759 1827949 1362787 ... $ Y2006: int 1107408 1861639 1102568 1669191 1735069 1871471 1841266 1441351 1803852 1339608 ... $ Y2007: int 1440134 1465841 1109382 1801213 1812546 1814218 1976976 1300836 1595981 1278550 ... $ Y2008: int 1945229 1551826 1752886 1188104 1487315 1875146 1764457 1762096 1193245 1756185 ...
# A set of verbs for operations such as applying filter, selecting specific columns,
# sorting data, adding or deleting columns and aggregating data.