机器学习(Machine Learning and Data Mining)CS 5751——Lab2作业记录

Activity1

下载数据:https://archive.ics.uci.edu/ml/datasets/Auto+MPG

(1)创建数据

# Toy data set: seven people, each with an age and a capital gain/loss.
PEOPLE <- data.frame(
  age     = c(86, 82, 83, 24, 23, 49, 48),
  capital = c(98190, -148497, -69783, 98458, -148436, 95678, -98000)
)
# Echo the data frame to the console.
print(PEOPLE)

(2)画出图像

# Scatter plot of capital gains against age, drawn with large filled dots.
plot(
  x = PEOPLE$age,
  y = PEOPLE$capital,
  main = "distribution of age/capital gains",
  xlab = "age",
  ylab = "capital gains",
  pch = 19,
  cex = 3
)

在这里插入图片描述

(3)计算距离

  • 欧几里得距离Euclidean distance
# Pairwise distances between the rows of PEOPLE. dist() defaults to the
# Euclidean method; `p` is only consulted when method = "minkowski".
dist(x = PEOPLE, p = 2)

Is this desirable? Why?
欧几里得距离在低维度的时候比较常用,但它并不会考虑属性之间的重要性,也不会考虑数据本身的密度。在本例中,capital 的数值范围远大于 age,未经标准化时距离几乎完全由 capital 决定。

  • 曼哈顿距离(L1)
# Pairwise Manhattan (L1) distances between the rows of PEOPLE.
dist(x = PEOPLE, method = "manhattan")
  • 马氏距离Mahalanobis distance
# Mahalanobis distance of every row from the column means, using the sample
# covariance of PEOPLE. Note: stats::mahalanobis() returns SQUARED distances.
mahalanobis(x = PEOPLE, center = colMeans(PEOPLE), cov = cov(PEOPLE))

在这里插入图片描述
知乎:为什么我们要用马氏距离
马氏距离的优点:
(1)与原始数据的测量单位无关。
(2)马哈拉诺比斯距离也消除了变量之间相关性的干扰。

Activity2

(1)从本地得到数据

# Load the Auto MPG data from a local, header-less CSV file.
# "?" (with or without a leading space) and empty fields are read as NA.
# read.csv() keeps check.names = TRUE, so the supplied names "model year" and
# "car name" end up as the columns model.year and car.name.
MPGdata <- read.csv(
  "auto-mpg.csv",
  header = FALSE,
  # Spelled out in full: the original `stringsAsFactor` only worked through
  # partial argument matching.
  stringsAsFactors = FALSE,
  na.strings = c("?", "", " ?"),
  col.names = c(
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model year", "origin", "car name"
  )
)

(2)找出缺失值

# Row positions of MPGdata that contain at least one missing value.
NA_index <- which(rowSums(is.na(MPGdata)) > 0)
# Rows with missing values.
MPG_NA <- MPGdata[NA_index, ]
# Complete rows. Positive indexing is used on purpose: MPGdata[-NA_index, ]
# returns zero rows (instead of every row) whenever NA_index is empty.
MPG_A <- MPGdata[setdiff(seq_len(nrow(MPGdata)), NA_index), ]
# Show the incomplete rows.
MPG_NA

(3)用平均值代替缺失值

#' Replace missing numeric values with column means
#'
#' Every `NA` in a numeric column is replaced by the mean of that column taken
#' over the complete rows of `dataset` (rows without any `NA`). Non-numeric
#' columns are left untouched.
#'
#' Unlike the earlier version, this works on the `dataset` argument itself
#' rather than on the global objects `MPGdata` / `MPG_A`, and it also covers
#' the last column.
#'
#' @param dataset A data frame.
#' @return The rows of `dataset` that contained at least one `NA`, with the
#'   numeric gaps filled in. A zero-row data frame if nothing was missing.
instead_missing_with_mean <- function(dataset) {
  has_na <- rowSums(is.na(dataset)) > 0
  NA_index <- which(has_na)
  # Means come from complete rows only, matching the original use of MPG_A.
  complete_rows <- dataset[!has_na, , drop = FALSE]

  for (col in seq_along(dataset)) {
    missing <- is.na(dataset[[col]])
    # mean() is undefined for character columns, so only numeric ones are filled.
    if (is.numeric(dataset[[col]]) && any(missing)) {
      dataset[missing, col] <- mean(complete_rows[[col]])
    }
  }

  dataset[NA_index, ]
}

# Display the rows returned by the mean-imputation helper for MPGdata.
instead_missing_with_mean(dataset = MPGdata)

(4)KNN(K-Nearest-Neighbour)

K-最近邻算法,是一种惰性学习算法。

library(DMwR)
# Re-read the raw file so the imputation starts from the original NA values.
MPGdata <- read.csv(
  "auto-mpg.csv",
  header = FALSE,
  # Spelled out in full: the original `stringsAsFactor` only worked through
  # partial argument matching.
  stringsAsFactors = FALSE,
  na.strings = c("?", "", " ?"),
  col.names = c(
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model year", "origin", "car name"
  )
)
# Keep the first eight columns only; the ninth (car name) is dropped.
MPGnew <- MPGdata[, 1:8]
# Fill each NA from its 10 nearest neighbours, using the median method.
MPGnew <- knnImputation(MPGnew, k = 10, meth = "median")
# Show the imputed rows. NA_index is not defined here; it must already exist
# from step (2).
MPGnew[NA_index, ]

Activity3

(1)等频率equal-frequency

#' Equal-frequency (equal-depth) bin boundaries
#'
#' Sorts the non-missing values and returns the smallest value followed by the
#' value at position `floor(k * n / N)` for `k = 1, ..., N`, where `n` is the
#' number of non-missing values.
#'
#' Boundary positions are computed by multiplication rather than by repeatedly
#' adding a fractional step, so rounding error cannot drop the last boundary.
#' Empty input returns an empty vector instead of looping forever.
#'
#' @param itemlist A vector of values; `NA`s are ignored.
#' @param N Number of bins; a single whole number >= 1.
#' @return A vector of boundary values in ascending order (normally `N + 1`
#'   values; fewer when `N` exceeds the number of non-missing values).
equal_frequency_discretization <- function(itemlist, N) {
  stopifnot(
    is.numeric(N), length(N) == 1, !is.na(N), N >= 1, N == floor(N)
  )

  sorted <- sort(itemlist[!is.na(itemlist)])
  n <- length(sorted)
  if (n == 0) {
    return(sorted)
  }

  cut_index <- floor(seq_len(N) * n / N)
  # With more bins than values some positions fall on 0; those select nothing.
  cut_index <- cut_index[cut_index >= 1]

  c(sorted[1], sorted[cut_index])
}

(2)等宽度equal-width

#' Equal-width bin boundaries
#'
#' Drops missing values, then returns a sequence running from the minimum to
#' the maximum in steps of `(max - min) / N`.
#'
#' @param itemlist A numeric vector; `NA`s are removed first.
#' @param N Number of bins the range is divided into.
#' @return The numeric sequence of boundaries produced by `seq()`.
equal_width_discretization <- function(itemlist, N) {
  values <- na.omit(itemlist)
  upper <- max(values)
  lower <- min(values)
  step <- (upper - lower) / N
  boundaries <- seq(lower, upper, step)
  return(boundaries)
}

(3)等频率的图

# Read a fresh copy of the Auto MPG data for the discretization plots.
MPGdata3 <- read.csv(
  "auto-mpg.csv",
  header = FALSE,
  # Spelled out in full: the original `stringsAsFactor` only worked through
  # partial argument matching.
  stringsAsFactors = FALSE,
  na.strings = c("?", "", " ?"),
  col.names = c(
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model year", "origin", "car name"
  )
)
# Equal-frequency boundaries (3 bins) for mpg and for horsepower.
linebox1 <- equal_frequency_discretization(MPGdata3$mpg, 3)
linebox2 <- equal_frequency_discretization(MPGdata3$horsepower, 3)
x <- MPGdata3$mpg
# Uniform random y values, one per observation.
y <- runif(length(x), min = 0, max = 100) # random_value
plot(
  x, y,
  main = "equal-frequency discretization(mpg)",
  xlab = "mpg",
  ylab = "random_value",
  pch = 19
)
# One dashed blue vertical line per boundary.
for (i in linebox1) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}

horsepower

# Horsepower on the x axis against uniform random y values.
x <- MPGdata3$horsepower
y <- runif(length(x), min = 0, max = 100) # random_value
plot(
  x, y,
  main = "equal-frequency discretization(horsepower)",
  xlab = "horsepower",
  ylab = "random_value",
  pch = 19
)
# One dashed blue vertical line per value in linebox2.
for (i in linebox2) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}

在这里插入图片描述

(4)等宽度的图

# Equal-width boundaries (3 bins) for mpg and for horsepower.
linebox3 <- equal_width_discretization(itemlist = MPGdata3$mpg, N = 3)
linebox4 <- equal_width_discretization(itemlist = MPGdata3$horsepower, N = 3)

mpg

# mpg on the x axis against uniform random y values.
x <- MPGdata3$mpg
y <- runif(length(x), min = 0, max = 100) # random_value
plot(
  x, y,
  main = "equal-width discretization(mpg)",
  xlab = "mpg",
  ylab = "random_value",
  pch = 19
)
# One dashed blue vertical line per value in linebox3.
for (i in linebox3) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}

在这里插入图片描述

horsepower

# Horsepower on the x axis against uniform random y values.
x <- MPGdata3$horsepower
y <- runif(length(x), min = 0, max = 100) # random_value
plot(
  x, y,
  main = "equal-width discretization(horsepower)",
  xlab = "horsepower",
  ylab = "random_value",
  pch = 19
)
# One dashed blue vertical line per value in linebox4.
for (i in linebox4) {
  abline(v = i, lwd = 4, lty = 2, col = "blue")
}

在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/yinxx325/article/details/83241036
今日推荐