机器学习(Machine Learning and Data Mining)CS 5751——Lab2作业记录
Activity1
下载数据:https://archive.ics.uci.edu/ml/datasets/Auto+MPG
(1)创建数据
# Create the data frame.
# Toy data set: seven people with their age and capital gain/loss
# (negative values are losses).
PEOPLE <- data.frame(
  age     = c(86, 82, 83, 24, 23, 49, 48),
  capital = c(98190, -148497, -69783, 98458, -148436, 95678, -98000)
)
print(PEOPLE)
(2)画出图像
# Scatter plot of the two attributes; large solid dots (pch 19, cex 3)
# so individual people are easy to see.
with(PEOPLE, plot(
  age, capital,
  main = "distribution of age/capital gains",
  xlab = "age",
  ylab = "capital gains",
  pch = 19,
  cex = 3
))
(3)计算距离
- 欧几里得距离Euclidean distance
# Pairwise Euclidean (L2) distances between the seven people.
dist(PEOPLE, method = "euclidean")
Is this desirable? Why?
欧几里得距离在低维度的时候,比较常用。但它并不会考虑属性之间的重要性,也不会考虑数据本身的密度。
- 曼哈顿距离(L1)
# Pairwise Manhattan (L1) distances between the seven people.
dist(PEOPLE, method = "manhattan")
- 马氏距离Mahalanobis distance
# Mahalanobis distance of each row from the column-mean centroid;
# the covariance matrix accounts for scale and correlation between attributes.
mahalanobis(PEOPLE, center = colMeans(PEOPLE), cov = cov(PEOPLE))
知乎:为什么我们要用马氏距离
马氏距离的优点:
(1)与原始数据的测量单位无关。
(2)马哈拉诺比斯距离也消除了变量之间相关性的干扰。
Activity2
(1)从本地得到数据
# Load the Auto MPG data; '?' (bare, empty, or padded) marks missing values.
# Fix: the original wrote `stringsAsFactor` and only worked through R's
# partial argument matching — spell the argument name out in full.
MPGdata <- read.csv(
  "auto-mpg.csv",
  header = FALSE,
  stringsAsFactors = FALSE,
  na.strings = c('?', '', ' ?'),
  col.names = c('mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model year', 'origin', 'car name')
)
(2)找出缺失值
# Partition the data: MPG_NA holds rows with at least one missing value,
# MPG_A holds the fully observed rows.
NA_index <- which(!complete.cases(MPGdata))
MPG_NA <- MPGdata[NA_index, ]
MPG_A <- MPGdata[-NA_index, ]
MPG_NA
(3)用平均值代替缺失值
# Replace missing values in `dataset` with the column mean computed over
# the fully observed rows, and return the (previously incomplete) rows.
#
# Fixes relative to the original:
#  - operates on the `dataset` argument instead of the globals
#    `MPGdata` / `MPG_A` (the parameter was previously ignored);
#  - imputes every numeric column (the original's `while (y < length)`
#    silently skipped the last column);
#  - non-numeric columns (e.g. 'car name') are left untouched instead of
#    producing NA-with-warning from mean() on a character vector.
#
# Args:
#   dataset: a data frame possibly containing NAs.
# Returns:
#   The rows of `dataset` that originally contained NAs, with the NAs in
#   numeric columns replaced by the column means of the complete rows.
instead_missing_with_mean <- function(dataset) {
  na_rows <- which(rowSums(is.na(dataset)) > 0)
  if (length(na_rows) == 0) {
    # Nothing to impute: return an empty slice with the right columns.
    return(dataset[na_rows, ])
  }
  complete_rows <- dataset[-na_rows, ]
  for (row in na_rows) {
    for (col in seq_len(ncol(dataset))) {
      if (is.numeric(dataset[[col]]) && is.na(dataset[row, col])) {
        dataset[row, col] <- mean(complete_rows[[col]])
      }
    }
  }
  dataset[na_rows, ]
}
# Show the previously incomplete rows with their NAs replaced by column means.
instead_missing_with_mean(MPGdata)
(4)KNN(K-Nearest-Neighbour)
K-最近邻算法,是一种惰性学习算法。
# kNN imputation: replace each NA with the median of its 10 nearest
# neighbours (DMwR package).
library(DMwR)
# Reload the raw data. Fix: spell out `stringsAsFactors` (the original's
# `stringsAsFactor` only worked via partial argument matching).
MPGdata <- read.csv(
  "auto-mpg.csv",
  header = FALSE,
  stringsAsFactors = FALSE,
  na.strings = c('?', '', ' ?'),
  col.names = c('mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model year', 'origin', 'car name')
)
# Fix: recompute the NA row index here rather than relying on the stale
# global left over from Activity 2.
NA_index <- which(rowSums(is.na(MPGdata)) > 0)
# Drop the non-numeric 'car name' column before imputing.
MPGnew <- MPGdata[, 1:8]
MPGnew <- knnImputation(MPGnew, k = 10, meth = "median")
# Inspect the rows that were imputed.
MPGnew[NA_index, ]
Activity3
(1)等频率equal-frequency
# Equal-frequency discretization: return the bin boundaries that split the
# (NA-free, sorted) values into N bins of (approximately) equal counts —
# the minimum followed by each bin's upper cut value.
#
# Fixes relative to the original:
#  - removed the stray no-op `list` expression and stopped shadowing base
#    `list`;
#  - cut positions are computed as floor(k * n / N) instead of repeatedly
#    adding n / N, whose floating-point drift could overshoot n and drop
#    the final (maximum) boundary.
#
# Args:
#   itemlist: numeric vector, possibly containing NAs (dropped).
#   N:        number of bins.
# Returns:
#   Numeric vector of N + 1 boundary values (fewer if N > length(itemlist)).
equal_frequency_discretization <- function(itemlist, N) {
  itemlist <- na.omit(itemlist)
  sorted_vals <- sort(itemlist)
  n <- length(sorted_vals)
  # Integer cut positions; drop any index below 1 (possible when N > n).
  cut_idx <- floor(seq_len(N) * n / N)
  cut_idx <- cut_idx[cut_idx >= 1]
  c(sorted_vals[1], sorted_vals[cut_idx])
}
(2)等宽度equal-width
# Equal-width discretization: return N + 1 equally spaced cut points
# spanning the range of the (NA-free) values.
#
# Args:
#   itemlist: numeric vector, possibly containing NAs (dropped).
#   N:        number of bins.
# Returns:
#   Numeric vector of boundary values from min to max.
equal_width_discretization <- function(itemlist, N) {
  values <- na.omit(itemlist)
  lo <- min(values)
  hi <- max(values)
  seq(lo, hi, by = (hi - lo) / N)
}
(3)等频率的图
# Reload the raw data and overlay the equal-frequency cut points on scatter
# plots of mpg and horsepower. Fix: spell out `stringsAsFactors` (the
# original's `stringsAsFactor` only worked via partial argument matching).
MPGdata3 <- read.csv(
  "auto-mpg.csv",
  header = FALSE,
  stringsAsFactors = FALSE,
  na.strings = c('?', '', ' ?'),
  col.names = c('mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model year', 'origin', 'car name')
)
linebox1 <- equal_frequency_discretization(MPGdata3$mpg, 3)
linebox2 <- equal_frequency_discretization(MPGdata3$horsepower, 3)
# Random vertical jitter so the one-dimensional attribute reads as a scatter.
x <- MPGdata3$mpg
y <- runif(length(x), min = 0, max = 100)
plot(x, y, main = "equal-frequency discretization(mpg)", xlab = "mpg", ylab = "random_value", pch = 19)
for (i in linebox1) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}
x <- MPGdata3$horsepower
y <- runif(length(x), min = 0, max = 100)
plot(x, y, main = "equal-frequency discretization(horsepower)", xlab = "horsepower", ylab = "random_value", pch = 19)
for (i in linebox2) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}
(4)等宽度的图
# Overlay the equal-width cut points on scatter plots of mpg and horsepower.
linebox3 <- equal_width_discretization(MPGdata3$mpg, 3)
linebox4 <- equal_width_discretization(MPGdata3$horsepower, 3)
# mpg ----
# Fix: the bare symbol `mpg` on its own line would raise
# "object 'mpg' not found" when sourced; it was a section label.
x <- MPGdata3$mpg
y <- runif(length(x), min = 0, max = 100)   # random vertical jitter
plot(x, y, main = "equal-width discretization(mpg)", xlab = "mpg", ylab = "random_value", pch = 19)
for (i in linebox3) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}
# horsepower ----
# Fix: likewise, the bare symbol `horsepower` was a section label, not code.
x <- MPGdata3$horsepower
y <- runif(length(x), min = 0, max = 100)   # random vertical jitter
plot(x, y, main = "equal-width discretization(horsepower)", xlab = "horsepower", ylab = "random_value", pch = 19)
for (i in linebox4) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}