Data Type Conversion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Type checking: the is.*() family
methods(is) #list all available is.* functions
Determine whether it is a data frame
is.data.frame(data)
convert
where vectors can be converted to a variety of data
matrix to data frame
data <- as.data.frame(data)
Convert data frame to matrix
as.matrix()
convert to factor
as.factor()
If names are not needed, remove them (e.g. column names)
unname()
convert to vector
unlist()
Subsetting - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Take specific rows and columns of a data frame
data1 <-data[c(1:50),c(1:30)] #Continuously extract rows and columns
data2 <-data[c(1,3,4,5),c(1,12,15)] #non-continuous extraction of rows and columns
Filter rows with logical conditions (note the trailing comma: rows first, then columns)
data3 <- data[which(data$factor == 7), ] #keep rows where factor equals 7
data4 <- data[which(data$factor > 7 & data$factor <= 100), ]
subset function
data4 <- subset(data, data$factor > 7 & data$factor <= 100)
sample sampling
Random sampling with or without replacement is possible
sample(x, num, replace = TRUE) #x is the vector to sample from, num is the number of items to draw, replace = TRUE samples with replacement
Merge - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cbind(dataframe, factor) #merge columns
rbind() #merge rows; the columns of the new rows must match those of the original data
merge(x,y ,by= "") # by names the column(s) used as the key for the merge
Flip - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Row and column flip t()
tdata<- t(data)
Reverse a single vector with rev()
rev(vector)
e.g. reverse the row order of a data frame
women[rev(rownames(women)),]
Revise- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
deduplication of data
duplicated(data) #return duplicate boolean value
data[!duplicated(data),] #Take out the non-duplicated part
unique(data) # take out the non-repeating part in one step
Modify a column transform()
transform(women, height = height*2.54) #Original data operation
transform(women, cm = height*2.54) #Generate a new column
Sort - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Sort on a single condition
sort()
sort() #The default number is from small to large, and English is sorted by the first letter
rev(sort()) #sort in reverse
#sort cannot be used to sort a data frame directly, only vectors, but there is a workaround: sort the row names and index with them
mtcars[sort(rownames(mtcars)), ]
order()
#Returns the position of the vector instead of the sorted result
mtcars[order(mtcars$mpg), ]
Sort by multiple criteria
mtcars[order(mtcars$mpg, mtcars$disp), ]
Commonly used data conversion packages - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
reshape2
wide to long
melt(data, id.vars = c("col1", "col2" )) #melt data, change wide data into long data, id.vars is to be kept
Long to wide
dcast(aql, month+day ~variable) #The levels of the variable column become the new column names; the month and day columns are kept as ids on the far left, and the new columns are appended from left to right
tidyr
tidydata: An observation and a variable determine a value
wide to long
gather()
Long to wide
spread()
A column is divided into multiple columns
separate()
e.g
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
separate(df,col = x, into = c("A", "B"),sep = "\\.") #By default any non-alphanumeric separator is recognized; it can also be specified with sep (a regular expression, so a literal dot is "\\.")
Merge multiple columns into one column
unite()
e.g
unite(x, col = "AB", A, B, sep = "-")
dplyr
filter
* :: is to prevent conflicts between packages with the same function name
filter()
e.g
dplyr::filter(iris, Sepal.Length >7) #keep only the iris rows with sepal length > 7
remove duplicate rows
dplyr::distinct(data)
Slicing out any row
dplyr::slice(iris,10:15) #Take out rows 10-15
sampling
dplyr::sample_n(iris, 10) # Randomly select ten rows
dplyr::sample_frac(iris,0.1) #Randomly select a proportion (here 10%) of the rows
to sort
dplyr::arrange(iris, Sepal.Length) #sort by sepal length
dplyr::arrange(iris, desc(Sepal.Length)) #Sort in the opposite (descending) direction
Subset
select()
statistics
summarise(iris, avg = mean(Sepal.Length)) #calculate the average length of the sepal
group
dplyr::group_by(iris, Species)
iris %>% group_by(Species)
add variable
dplyr::mutate(iris, new = Sepal.Length + Petal.Length)
multi-table operation
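dplyr::left_join() #left join: keep all rows of the left table
dplyr::right_join() #right join: keep all rows of the right table
dplyr::inner_join() #inner join: the intersection (only rows that match in both tables)
dplyr::full_join() #full join: the union (all rows of both tables)
dplyr::semi_join() #semi join: keep the rows of the left table that have a match in the right table
dplyr::anti_join() #anti join: keep the rows of the left table that have no match in the right table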
Multi-dataset operations
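intersect() #take the intersection
union() #take the union (duplicates removed); union_all() keeps duplicates
setdiff() #take the set difference (rows in the first data set that are not in the second)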
Pipe character (chain operator %>%) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Passes the output of one function to the next function as its input
In RStudio it can be inserted with the Ctrl + Shift + M shortcut
Mathematical calculations on data frames - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
row sum rowSums()
rs <- rowSums(WorldPhones)
total <- cbind(WorldPhones,Total = rs) #add the row sums as a new column
Column Means colMeans()
cm<- colMeans(WorldPhones)
apply() is more versatile, so use it
apply(WorldPhones, MARGIN = 1, FUN = sum)
MARGIN: 1 represents row processing, 2 represents column processing
lapply() returns a list
sapply() returns a vector/matrix
tapply() processes factor data, group by factor and then process
tapply(state.name, state.division, FUN = length)
Data centralization and standardization - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Function: Eliminate the impact of dimensions on data, making the difference between data smaller
centralized
The data in the data set minus the mean of the data set
x-mean(x)
standardization
After the data set is centered, it is divided by the standard deviation of the data set
(x - mean(x)) / sd(x)
Centralization + Standardization
scale(x,center = TRUE, scale = TRUE)