#install.packages("XLConnect", repos = "http://cran.us.r-project.org")
library(XLConnect) #lib.loc="C:/Users/Suman/Documents/R/win-library/3.4")
# Download an Excel file from the internet
# url <- "http://miraisolutions.files.wordpress.com/"
# path <- "2013/01/"
# file.name <- "example.xlsx"
# download.file(paste(url, path, file.name, sep = ""), file.name)
# Load the workbook
wb <- XLConnect::loadWorkbook("data/latitude.xlsx")
# Read data from a target section of the worksheet and print
my.data <- readWorksheet(wb, sheet = "1700", startRow = 10, startCol = 1,
endRow = 42, endCol = 2)
print(my.data)
Antigua.and.Barbuda X17.072 1 Argentina -36.67600 2 Armenia 40.25400 3 Aruba 12.51300 4 Australia -32.21900 5 Austria 48.23100 6 Azerbaijan 40.35200 7 Bahamas 24.70000 8 Bahrain 26.02400 9 Bangladesh 23.88000 10 Barbados 13.17900 11 Belarus 53.54709 12 Belgium 50.83700 13 Belize 17.84300 14 Benin 6.36400 15 Bermuda 32.21700 16 Bhutan 27.47900 17 Bolivia -15.19000 18 Bosnia and Herzegovina 44.17501 19 Botswana -21.53600 20 Brazil -19.55700 21 British Virgin Islands 18.50000 22 Brunei 4.50100 23 Bulgaria 42.07300 24 Burkina Faso 12.04900 25 Burundi -3.36500 26 Cambodia 12.02600 27 Cameroon 10.73000 28 Canada 43.72700 29 Cape Verde 15.09100 30 Cayman Islands 19.31900 31 Central African Rep. 4.33100 32 Chad 10.37700
#install.packages("xlsx", repos = "http://cran.us.r-project.org")
# Use xlsx package
library("xlsx")
my_data <- read.xlsx("data/cities.xlsx", sheetName = "year_2000")
my_data
my_data <- read.xlsx2("data/cities.xlsx", sheetName = "year_1990") # Prefer the read.xlsx2() over read.xlsx(), it’s significantly faster for large dataset.
my_data
Capital | Population |
---|---|
New York | 17800000 |
Berlin | 3382169 |
Madrid | 2938723 |
Stockholm | 1942362 |
NA | NA |
NA | NA |
NA | NA |
NA | NA |
NA | NA |
Capital | Population |
---|---|
New York | 16044000 |
Berlin | 3433695 |
Madrid | 3010492 |
Stockholm | 1683713 |
# Use readxl package to read xls|xlsxl
#install.packages("readxl")
library(readxl)
excel_sheets("data/cities.xlsx")
my_data<-read_excel("data/cities.xlsx")
my_data
read_excel("data/cities.xlsx", sheet = 2)
read_excel("data/cities.xlsx", sheet = "year_2000")
# col_names = FALSE: R assigns names itself
# col_names = character vector: manually specify
Capital | Population |
---|---|
New York | 16044000 |
Berlin | 3433695 |
Madrid | 3010492 |
Stockholm | 1683713 |
Capital | Population |
---|---|
New York | 17800000 |
Berlin | 3382169 |
Madrid | 2938723 |
Stockholm | 1942362 |
Capital | Population |
---|---|
New York | 17800000 |
Berlin | 3382169 |
Madrid | 2938723 |
Stockholm | 1942362 |
# working with xml file from url link
library(RCurl)
library(XML, lib.loc="~/R/win-library/3.4")
# library(RCurl, lib.loc="~/R/win-library/3.4")
fileURL <- "https://www.w3schools.com/xml/simple.xml"
xData <- getURL(fileURL)
doc <- xmlParse(xData)
rootNode <- xmlRoot(doc)
xmldataframe1 <- xmlToDataFrame(xData)
xmldataframe1
# working with local file
result <- xmlParse("data/books.xml")
print(result)
rootNode <- xmlRoot(result)
rootSize <- xmlSize(rootNode)
print(rootSize)
print(rootNode[1])
xmldataframe <- xmlToDataFrame("data/books.xml")
print(xmldataframe)
head(xmldataframe)
Loading required package: bitops
Error in library(XML, lib.loc = "~/R/win-library/3.4"): no library trees found in 'lib.loc' Traceback: 1. library(XML, lib.loc = "~/R/win-library/3.4") 2. stop(txt, domain = NA)
#install.packages("rjson", repos = "http://cran.us.r-project.org")
# Load the package required to read JSON files.
library("rjson", lib.loc="~/R/win-library/3.4")
# Give the input file name to the function.
result <- fromJSON(file = "data/JSN.json")
# Print the result.
print(result)
as.data.frame(result)
$ID [1] "1" "2" "3" "4" "5" "6" "7" "8" $Name [1] "Rick" "Dan" "Michelle" "Ryan" "Gary" "Nina" "Simon" [8] "Guru" $Salary [1] "623.3" "515.2" "611" "729" "843.25" "578" "632.8" "722.5" $StartDate [1] "1/1/2012" "9/23/2013" "11/15/2014" "5/11/2014" "3/27/2015" [6] "5/21/2013" "7/30/2013" "6/17/2014" $Dept [1] "IT" "Operations" "IT" "HR" "Finance" [6] "IT" "Operations" "Finance"
ID | Name | Salary | StartDate | Dept |
---|---|---|---|---|
1 | Rick | 623.3 | 1/1/2012 | IT |
2 | Dan | 515.2 | 9/23/2013 | Operations |
3 | Michelle | 611 | 11/15/2014 | IT |
4 | Ryan | 729 | 5/11/2014 | HR |
5 | Gary | 843.25 | 3/27/2015 | Finance |
6 | Nina | 578 | 5/21/2013 | IT |
7 | Simon | 632.8 | 7/30/2013 | Operations |
8 | Guru | 722.5 | 6/17/2014 | Finance |
url <- "https://en.wikipedia.org/wiki/Nandi_Award_for_Best_Actress"
# install.packages("httr", repos = "http://cran.us.r-project.org")
library(httr)
library(XML) # for readHTMLTable
urldata <- GET(url)
data <- readHTMLTable(rawToChar(urldata$content),stringsAsFactors = FALSE, as.data.frame = TRUE)
head(data)
V1 | V2 | V3 | V4 |
---|---|---|---|
2016 | Ritu Varma | Pelli Choopulu | NA |
2015 | Anushka Shetty | Size Zero | NA |
2014 | Anjali | Geethanjali | NA |
2013 | Anjali Patil | Naa Bangaaru Talli | NA |
2012 | Samantha Ruth Prabhu | Yeto Vellipoyindhi Manasu | NA |
2011 | Nayantara | Sri Rama Rajyam | NA |
2010 | Nithya Menen[5] | Ala Modalaindi | NA |
2009 | Thirtha[6] | Sontha Ooru | NA |
2008 | Swathi | Ashta Chamma | NA |
2007 | Charmy Kaur | Mantra | NA |
2006 | Nandita Das | Kamli | NA |
2005 | Trisha | Nuvvostanante Nenoddantana | NA |
2004 | Kamalinee Mukherjee | Anand | NA |
2003 | Bhoomika Chawla | Missamma | NA |
2002 | Kalyani | Avunu Valliddaru Ista Paddaru | NA |
2001 | Laya[7] | Preminchu | NA |
2000 | Laya[8] | Manoharam | NA |
1999 | Maheswari | Nee Kosam | NA |
1998 | Ramya Krishnan | Kante Koothurne Kanu | NA |
1998 | Soundarya | Antahpuram | NA |
1997 | Vijayashanti | Osey Ramulamma | NA |
1996 | Soundarya | Pavithra Bandham | NA |
1995 | Aamani | Subha Sankalpam | NA |
1994 | Soundarya | Ammoru | NA |
1993 | Aamani | Mr. Pellam | NA |
1992 | Meena | Rajeshwari Kalyanam | NA |
1991 | Sridevi | Kshana Kshanam | NA |
1990 | VijayashantiMeena | KarthavyamSeetharamaiah Gari Manavaralu | |
1989 | Vijayashanti | Bharatha Naari | NA |
1988 | Bhanupriya | Swarna Kamalam | NA |
1987 | Sumalatha | Sruthilayalu | NA |
1986 | Lakshmi | Sravana Megalu | NA |
1985 | Vijayashanti | Pratighatana | NA |
1984 | Suhasini | Swathi | NA |
1983 | Jayasudha | Dharmaatmudu | NA |
1982 | Jayasudha | Meghasandesam | NA |
1981 | Jayasudha | Premabhishekam | NA |
1980 | Sakuntala | Kukka | NA |
1979 | Jayasudha | Idi Katha Kaadu | NA |
1978 | Roopa | Naalaaga Endaro | NA |
1977 | Lakshmi[1] | Pantulamma | NA |
1976 | Jayaprada[1] | Anthuleni Kadha | NA |
1975 | Jayasudha[1] | Jyothi | NA |
dataDF<-as.data.frame(data)
head(dataDF)
dataDF$NULL.V4 <- NULL
names(dataDF) <- c("Year", "Actress", "Film")
dim(dataDF)
print(dataDF)
Year Actress Film 1 2016 Ritu Varma Pelli Choopulu 2 2015 Anushka Shetty Size Zero 3 2014 Anjali Geethanjali 4 2013 Anjali Patil Naa Bangaaru Talli 5 2012 Samantha Ruth Prabhu Yeto Vellipoyindhi Manasu 6 2011 Nayantara Sri Rama Rajyam 7 2010 Nithya Menen[5] Ala Modalaindi 8 2009 Thirtha[6] Sontha Ooru 9 2008 Swathi Ashta Chamma 10 2007 Charmy Kaur Mantra 11 2006 Nandita Das Kamli 12 2005 Trisha Nuvvostanante Nenoddantana 13 2004 Kamalinee Mukherjee Anand 14 2003 Bhoomika Chawla Missamma 15 2002 Kalyani Avunu Valliddaru Ista Paddaru 16 2001 Laya[7] Preminchu 17 2000 Laya[8] Manoharam 18 1999 Maheswari Nee Kosam 19 1998 Ramya Krishnan Kante Koothurne Kanu 20 1998 Soundarya Antahpuram 21 1997 Vijayashanti Osey Ramulamma 22 1996 Soundarya Pavithra Bandham 23 1995 Aamani Subha Sankalpam 24 1994 Soundarya Ammoru 25 1993 Aamani Mr. Pellam 26 1992 Meena Rajeshwari Kalyanam 27 1991 Sridevi Kshana Kshanam 28 1990 VijayashantiMeena KarthavyamSeetharamaiah Gari Manavaralu 29 1989 Vijayashanti Bharatha Naari 30 1988 Bhanupriya Swarna Kamalam 31 1987 Sumalatha Sruthilayalu 32 1986 Lakshmi Sravana Megalu 33 1985 Vijayashanti Pratighatana 34 1984 Suhasini Swathi 35 1983 Jayasudha Dharmaatmudu 36 1982 Jayasudha Meghasandesam 37 1981 Jayasudha Premabhishekam 38 1980 Sakuntala Kukka 39 1979 Jayasudha Idi Katha Kaadu 40 1978 Roopa Naalaaga Endaro 41 1977 Lakshmi[1] Pantulamma 42 1976 Jayaprada[1] Anthuleni Kadha 43 1975 Jayasudha[1] Jyothi
# library (plyr)
# df <- ldply (data, data.frame)
# dim(df)
# head(df)
further reference: https://www.apress.com/in/book/9781461478997
rvest in action
library(rvest)
# install.packages("xml2", repos = "http://cran.us.r-project.org")
library(xml2,lib.loc="~/R/win-library/3.4")
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
#print(lego_movie)
lego_movie %>%
html_node("strong span") %>%
html_text() %>%
as.numeric()
# selector:
lego_movie %>%
html_nodes("#titleCast .itemprop span") %>%
html_text()
# The titles and authors of recent message board postings are stored in a the third table on the
Your code contains a unicode char which cannot be displayed in your current locale and R will silently convert it to an escaped form when the R kernel executes this code. This can lead to subtle errors if you use such chars to do comparisons. For more information, please see https://github.com/IRkernel/repr/wiki/Problems-with-unicode-on-windows
browseVignettes()
to check the any sample codes of the conceptsobesity = read_html("https://en.wikipedia.org/wiki/Obesity_in_the_United_States")
#Using rvest to Scrape an HTML Table by taking table number
obesity = obesity %>%
html_nodes("table") %>%
.[[2]]%>%
html_table(fill=T)
dim(obesity)
head(obesity)
tail(obesity)
States, District, & Territories | Obese adults (mid-2000s) | Obese adults (2016)[52][57] | Overweight (incl. obese) adults (mid-2000s) | Obese children and adolescents (mid-2000s)[58] | Obesity rank |
---|---|---|---|---|---|
Alabama | 30.1% | 35.7% | 65.4% | 16.7% | 3 |
Alaska | 27.3% | 31.4% | 64.5% | 11.1% | 14 |
American Samoa | — | 75%[56] | 95%[59] | 35%[56][60] | — |
Arizona | 23.3% | 29.0% | 59.5% | 12.2% | 40 |
Arkansas | 28.1% | 35.7% | 64.7% | 16.4% | 9 |
California | 23.1% | 25.0% | 59.4% | 13.2% | 41 |
States, District, & Territories | Obese adults (mid-2000s) | Obese adults (2016)[52][57] | Overweight (incl. obese) adults (mid-2000s) | Obese children and adolescents (mid-2000s)[58] | Obesity rank | |
---|---|---|---|---|---|---|
51 | Virgin Islands (U.S.) | — | 32.5% | — | — | — |
52 | Virginia | 25.2% | 29.0% | 61.6% | 13.8% | 27 |
53 | Washington | 24.5% | 28.6% | 60.7% | 10.8% | 32 |
54 | West Virginia | 30.6% | 37.7% | 66.8% | 20.9% | 2 |
55 | Wisconsin | 25.5% | 30.7% | 62.4% | 13.5% | 25 |
56 | Wyoming | 24.0% | 27.7% | 61.7% | 8.7% | 33 |
Steps
library(magrittr) # for the pipe operator
library(xml2) # for read_html() function
library(rvest) # for html_nodes() and html_table()
url <- "https://en.wikipedia.org/wiki/List_of_districts_in_Telangana"
districts <- url %>%
read_html() %>%
html_nodes(xpath='//*[@id="mw-content-text"]/div/table') %>%
html_table()
districts <- districts[[1]]
head(districts)
tail(districts)
# //*[@id="mw-content-text"]/div/table[1]/tbody/tr[7]/td[3]
S.No. | Name | Headquarters | Area (km2) | Population (2011 census) | No.of mandals | Density (per km2) | Urban (%) | Literacy (%) | Sex Ratio |
---|---|---|---|---|---|---|---|---|---|
1 | Adilabad | Adilabad | 4,153 | 708,972 | 18 | 171 | 23.66 | 63.46 | 989 |
2 | Bhadradri Kothagudem | Kothagudem | 7,483 | 1,069,261 | 23 | 143 | 31.71 | 66.40 | 1008 |
3 | Hyderabad | Hyderabad | 217 | 3,943,323 | 16 | 18172 | 100.00 | 83.25 | 954 |
4 | Jagtial | Jagtial | 2,419 | 985,417 | 18 | 407 | 22.46 | 60.26 | 1036 |
5 | Jangaon | Jangaon | 2,188 | 566,376 | 13 | 259 | 12.60 | 61.44 | 997 |
6 | Jayashankar Bhupalapally | Bhupalpalle | 6,175 | 711,434 | 20 | 115 | 7.57 | 60.33 | 1009 |
S.No. | Name | Headquarters | Area (km2) | Population (2011 census) | No.of mandals | Density (per km2) | Urban (%) | Literacy (%) | Sex Ratio | |
---|---|---|---|---|---|---|---|---|---|---|
27 | 27 | Vikarabad[9] | Vikarabad | 3,386 | 927,140 | 18 | 274 | 13.48 | 57.91 | 1001 |
28 | 28 | Wanaparthy | Wanaparthy | 2,152 | 577,758 | 14 | 268 | 15.97 | 55.67 | 960 |
29 | 29 | Warangal Rural | Warangal | 2,175 | 718,537 | 15 | 330 | 6.99 | 61.26 | 994 |
30 | 30 | Warangal Urban | Warangal | 1,309 | 1,080,858 | 11 | 826 | 68.51 | 76.17 | 997 |
31 | 31 | Yadadri Bhuvanagiri | Bhongir | 3,092 | 739,448 | 16 | 239 | 16.66 | 65.53 | 973 |
32 | Telangana | - | - | 112,077 | 35,003,674 | 584 | 312 | 38.88 | 66.54 | 988 |
Further References: https://stat4701.github.io/edav/2015/04/02/rvest_tutorial/
# install.packages("haven", repos = "http://cran.us.r-project.org")
library(haven)
ontime <- read_sas("data/sales.sas7bdat")
head(ontime)
Warning message: "package 'haven' was built under R version 3.4.3"
purchase | age | gender | income |
---|---|---|---|
0 | 41 | Female | Low |
0 | 47 | Female | Low |
1 | 41 | Female | Low |
1 | 39 | Female | Low |
0 | 32 | Female | Low |
0 | 32 | Female | Low |
trade <- read_stata("data/trade.dta")
head(trade)
trade <- read_dta("data/trade.dta")
head(trade)
Date | Import | Weight_I | Export | Weight_E |
---|---|---|---|---|
10 | 37664782 | 54029106 | 54505513 | 93350013 |
9 | 16316512 | 21584365 | 102700010 | 158000010 |
8 | 11082246 | 14526089 | 37935000 | 88000000 |
7 | 35677943 | 55034932 | 48515008 | 112000005 |
6 | 9879878 | 14806865 | 71486545 | 131800000 |
5 | 1539992 | 1749318 | 12311696 | 18500014 |
Date | Import | Weight_I | Export | Weight_E |
---|---|---|---|---|
10 | 37664782 | 54029106 | 54505513 | 93350013 |
9 | 16316512 | 21584365 | 102700010 | 158000010 |
8 | 11082246 | 14526089 | 37935000 | 88000000 |
7 | 35677943 | 55034932 | 48515008 | 112000005 |
6 | 9879878 | 14806865 | 71486545 | 131800000 |
5 | 1539992 | 1749318 | 12311696 | 18500014 |
Anxiety <- read_spss("data/Anxiety 2.sav")
Anxiety
subject | anxiety | tension | trial1 | trial2 | trial3 | trial4 |
---|---|---|---|---|---|---|
1 | 1 | 1 | 18 | 14 | 12 | 6 |
2 | 1 | 1 | 19 | 12 | 8 | 4 |
3 | 1 | 1 | 14 | 10 | 6 | 2 |
4 | 1 | 2 | 16 | 12 | 10 | 4 |
5 | 1 | 2 | 12 | 8 | 6 | 2 |
6 | 1 | 2 | 18 | 10 | 5 | 1 |
7 | 2 | 1 | 16 | 10 | 8 | 4 |
8 | 2 | 1 | 18 | 8 | 4 | 1 |
9 | 2 | 1 | 16 | 12 | 6 | 2 |
10 | 2 | 2 | 19 | 16 | 10 | 8 |
11 | 2 | 2 | 16 | 14 | 10 | 9 |
12 | 2 | 2 | 16 | 12 | 8 | 8 |
library(foreign)
# from SAS
#nstall.packages("sas7bdat", repos = "http://cran.us.r-project.org")
library(sas7bdat)
mydata <- read.sas7bdat("data/sales.sas7bdat")
head(mydata)
# from stata
mydata <- read.dta("data/trade.dta")
head(mydata)
# from SPSS
mySPSSData <- read.spss("data/Anxiety 2.sav",
to.data.frame=TRUE,
use.value.labels=FALSE)
head(mySPSSData)
purchase | age | gender | income |
---|---|---|---|
0 | 41 | Female | Low |
0 | 47 | Female | Low |
1 | 41 | Female | Low |
1 | 39 | Female | Low |
0 | 32 | Female | Low |
0 | 32 | Female | Low |
Date | Import | Weight_I | Export | Weight_E |
---|---|---|---|---|
2013-12-31 | 37664782 | 54029106 | 54505513 | 93350013 |
2012-12-31 | 16316512 | 21584365 | 102700010 | 158000010 |
2011-12-31 | 11082246 | 14526089 | 37935000 | 88000000 |
2010-12-31 | 35677943 | 55034932 | 48515008 | 112000005 |
2009-12-31 | 9879878 | 14806865 | 71486545 | 131800000 |
2008-12-31 | 1539992 | 1749318 | 12311696 | 18500014 |
re-encoding from CP1252
subject | anxiety | tension | trial1 | trial2 | trial3 | trial4 |
---|---|---|---|---|---|---|
1 | 1 | 1 | 18 | 14 | 12 | 6 |
2 | 1 | 1 | 19 | 12 | 8 | 4 |
3 | 1 | 1 | 14 | 10 | 6 | 2 |
4 | 1 | 2 | 16 | 12 | 10 | 4 |
5 | 1 | 2 | 12 | 8 | 6 | 2 |
6 | 1 | 2 | 18 | 10 | 5 | 1 |
# from SAS
# from stata
# from SPSS
This R Data Import Tutorial Is Everything You Need https://www.datacamp.com/community/tutorials/r-data-import-tutorial
rio
package)¶{r}library("rio")
# Importing data is handled with one function, import():
x <- import("mtcars.csv")
y <- import("mtcars.rds")
z <- import("mtcars.sav")
# Exporting data is handled with one function, export():
export(mtcars, "mtcars.csv") # comma-separated values
export(mtcars, "mtcars.rds") # R serialized
export(mtcars, "mtcars.sav") # SPSS```