[Getting and Cleaning data] Week 3
19136 단어 statisticsRcourseradatascience
For more details, see the html file here.
Week 3
Subsetting and Sorting
Once you have loaded your data into R, you will usually want to manipulate it so that it becomes a tidy data set: variables in the columns, observations in the rows, and only the observations that you want to analyze.
set.seed(13435)
X <- data.frame("var1" = sample(1:5), "var2" = sample(6:10), "var3" = sample(11:15))
X <- X[sample(1:5), ]
X$var2[c(1,3)] <- NA
# select the first column
X[ ,1]
X[ ,"var1"]
X$var1
# select both columns and rows
X[1:2, 2]
X[1:2, "var2"]
X[1:2, ]$var2
X[(X$var1 <= 3 & X$var3 > 11), ]
X[(X$var1 <= 3 | X$var3 > 15), ]
X[which(X$var2 > 8), ]
X[X$var2 > 8, ]
From the second line, we can see that if the column has NAs, a logical condition alone does not select observations cleanly (rows where the condition evaluates to NA come back as all-NA rows), so to drop them we need to use the
which
command. sort(X$var1)
sort(X$var1, decreasing = TRUE)
# Put NA last
sort(X$var2, na.last = TRUE)
# Put NA first
sort(X$var2, na.last = FALSE)
# NAs are removed by default
sort(X$var2)
# order data frame by variable "var1"
X[order(X$var1), ]
# order data frame by multiple variables
X[order(X$var1, X$var3), ]
plyr
package library(plyr)
arrange(X, var1) #
arrange(X, desc(var1))
# one way to add column
X$var4 5)
X
# the other way to add column
Y 5))
Y
Summarizing data
if(!file.exists("./data")) dir.create("./data")
fileUrl "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv")
restData "./data/restaurants.csv")
# top 3 rows
head(restData, 3)
# last 3 rows
tail(restData, 3)
summary(restData)
str(restData)
quantile(restData$councilDistrict, na.rm = TRUE)
quantile(restData$councilDistrict, probs = c(0.5, 0.75, 0.9))
# useNA = "ifany" adds a count of NAs when there are any; by default NAs are left out of the table
table(restData$zipCode, useNA = "ifany")
# two dimensional table
table(restData$councilDistrict, restData$zipCode)
sum(is.na(restData$councilDistrict))
any(is.na(restData$councilDistrict))
all(restData$zipCode > 0)
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
table(restData$zipCode %in% c("21212"))
table(restData$zipCode %in% c("21212", "21213"))
head(restData[restData$zipCode %in% c("21212", "21213"), ], 3)
data("UCBAdmissions")
DF as.data.frame(UCBAdmissions)
summary(DF)
# entry stands for sum of Freq
xt # this is a different table: each entry is the number of cross-classified observations
with(DF, table(Gender, Admit))
warpbreaks$replicate 1:9, len = 54)
xt
fakeData 1e5)
object.size(fakeData)
print(object.size(fakeData), units = "MB")
Creating new variables
Why create new variables?
if(!file.exists("./data")) dir.create("./data")
fileUrl "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/restaurants.csv")
restData "./data/restaurants.csv")
# creating sequence by
s1 1, 10, by = 2)
s1
# creating a sequence of a given length
s2 1, 10, length = 3)
s2
# creating a sequence based on a vector
x 1, 3, 8, 25, 100)
seq(along = x)
restData$nearMe in% c("Roland Park", "Homeland")
table(restData$nearMe)
restData$zipWrong 0, TRUE, FALSE)
table(restData$zipWrong, restData$zipCode<0)
restData$zipGroups
library(Hmisc)
restData$zipGroups 4)
table(restData$zipGroups)
restData$zcf #restData$zcf
class(restData$zcf)
yesno "yes", "no"), size = 10, replace = TRUE)
yesnofac "yes", "no"))
yesnofac
relevel(yesnofac, ref = "no")
as.numeric(yesnofac) # the first (reference) level is coded as 1, the other as 2
as.numeric(relevel(yesnofac, ref = "no"))
class(restData$zipGroups)
library(Hmisc) # cut2
library(plyr) # mutate
restData2 4))
table(restData2$zipGroups)
abs(x): absolute value
sqrt(x): square root
ceiling(x): ceiling(3.14) = 4
floor(x): floor(3.14) = 3
round(x, digits = n): round(3.1415926, digits = 2) = 3.14
signif(x, digits = n): signif(3.1415926, digits = 2) = 3.1
cos(x), sin(x), etc.
log(x): natural logarithm
log2(x), log10(x): other common logs
exp(x): exponentiating x
Reshaping data
The goal is tidy data: - Each variable forms a column - Each observation forms a row - Each table/file stores data about one kind of observation.
library(reshape2)
head(mtcars)
mtcars$carname # make multiple columns to one variable
carMelt "carname", "gear", "cyl"), measure.vars = c("mpg", "hp") )
# carMelt: a tall and skinny data set
head(carMelt, 3)
tail(carMelt, 3)
# work on the melt data set
clyData
head(InsectSprays)
tapply(InsectSprays$count, InsectSprays$spray, sum)
spIns
unlist(sprCount)
sprCount
plyr
package library(plyr)
## ddply: Split data frame, apply function, and return results in a data frame.
#ddply(InsectSprays, .(spray), summarize, sum = sum(count))
# Error in .fun(piece, ...) : argument "by" is missing, with no default
ddply(InsectSprays, .(spray), plyr::summarize, sum = sum(count))
spraySums
Managing data frames with dplyr package – Introduction
The data frame is a key data structure in statistics and in R.
dplyr
plyr
package.
dplyr verbs:
select: return a subset of the columns of a data frame.
filter: extract a subset of rows of a data frame based on a logical condition.
arrange: reorder rows of a data frame.
rename: rename variables in a data frame.
mutate: add new variables/columns or transform existing variables.
summarize / summarise: generate summary statistics of different variables in the data frame, possibly within strata.
There is also a handy print method that prevents you from printing a lot of data to the console.
Managing data frames with dplyr package – Basic tools
library(dplyr)
options(width = 105)
# download data and load data into R
fileUrl "https://raw.github.com/DataScienceSpecialization/courses/master/03_GettingData/dplyr/chicago.rds"
if(!file.exists("./data")) dir.create("./data")
download.file(fileUrl, destfile = "./data/chicago.rds", mode = "wb")
chicago <- readRDS("./data/chicago.rds")
dim(chicago)
str(chicago)
names(chicago)
select
command. head(select(chicago, city:dptp))
head(select(chicago, -(city:dptp)))
i "city", names(chicago))
j "dptp", names(chicago))
head(chicago[, -(i:j)])
filter
command. chic.f 30)
head(chic.f, 3)
chic.f 30 & tmpd > 80)
head(chic.f, 3)
chicago 2)
tail(chicago, 2)
arrange
command. chicago 2)
tail(chicago, 2)
rename
command. chicago 3)
mutate
command. chicago 3)
chicago 1*(tmpd >80), labels = c("cold", "hot")))
summarize
command. hotcold as.POSIXlt(date)$year+1900)
years
%>%
command. chicago %>% mutate(month = as.POSIXlt(date)$mon + 1) %>% group_by(month) %>% summarize(pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), mo2 = median(no2tmean2))
# =
summarize(group_by(mutate(chicago, month = as.POSIXlt(date)$mon+1), month), pm25 = mean(pm25, na.rm = TRUE), o3 = max(o3tmean2), mo2 = median(no2tmean2))
Merging data
if(!file.exists("./data")) dir.create("./data")
# download data set
fileUrl1 "https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv"
fileUrl2 "https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv")
download.file(fileUrl2, destfile = "./data/solution.csv")
# load data set
reviews "./data/reviews.csv")
solutions "./data/solution.csv")
# view data set
head(reviews, 2)
head(solutions, 2)
names(reviews)
names(solutions)
mergeData "solution_id", by.y = "id", all = TRUE)
head(mergeData)
intersect(names(solutions), names(reviews))
mergeData2 2)
plyr
package. df1 1:10), x = rnorm(10))
df2 1:10), y = rnorm(10))
arrange(join(df1, df2), id)
df3 1:10), z = rnorm(10))
dfList = list(df1, df2, df3)
join_all(dfList)
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
SPSS Statistics 27에서 "효과량"출력최근의 학술논문에서는 실험에서 유의한 차이가 있는지 여부를 나타내는 p-값뿐만 아니라 그 차이에 얼마나 효과가 있는지를 나타내는 효과량의 제시가 요구되고 있다. 일반적으로 두 가지 차이점은 효과량을 계산할 때 분산을...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.