# Problem 1
dir = "/studier/emner/matnat/math/STK2100/v22/data/"
Xtr = read.table(paste(dir,"train.csv",sep=""), header=T, sep=",")[,2:32]
Xte = read.table(paste(dir,"test.csv",sep=""), header=T, sep=",")[,2:32]

# (a)
fit = glm(X~., data=Xtr, family=binomial)
pred = predict(fit, Xte, type="response") > 0.5

# (c)
# principal components of the pooled features (column 31 is the response X)
x.prcomp = prcomp(x=as.matrix(rbind(Xtr,Xte)[,-31]), retx=TRUE, scale.=TRUE)
Xptr = data.frame(x.prcomp$x[1:dim(Xtr)[1],], X=Xtr$X)
Xpte = data.frame(x.prcomp$x[(dim(Xtr)[1]+1):(dim(Xtr)[1]+dim(Xte)[1]),],
                  X=Xte$X)
# logistic regression on the first two principal components
fit.pc = glm(X~., data=Xptr[,c(1:2,31)], family=binomial)
pred.pc = predict(fit.pc, Xpte[,c(1:2,31)], type="response") > 0.5

# uniform grid of points used to draw the decision boundary
X0 = runif(50000,-10,10)
X1 = runif(50000,-10,10)
Xnew = as.data.frame(cbind(X0,X1))
colnames(Xnew) = c("PC1","PC2")
preds = predict(fit.pc, Xnew, type="response") > 0.5
x1 = Xnew[preds,]
x0 = Xnew[!preds,]
plot(x1, col=4, xlim=c(-10,10), ylim=c(-10,10), pch=16)
points(x0, col=7, pch=16)
points(Xptr[Xptr$X==0,c(1,2)], col=3, pch=1)
points(Xptr[Xptr$X==1,c(1,2)], col=2, pch=1)
points(Xpte[Xpte$X==0,c(1,2)], col=3, pch=16)
points(Xpte[Xpte$X==1,c(1,2)], col=2, pch=16)

# (d)
# posterior probability of class 2 (X = 1) given x; p1 and p2 are the priors.
# Use S1 = S2 = Sigma for LDA and the class-specific covariances for QDA.
da_mi = function(x, p1, p2, m1, S1, m2, S2)
{
  pc1 = p1*mvtnorm::dmvnorm(x, m1, S1)
  pc2 = p2*mvtnorm::dmvnorm(x, m2, S2)
  pc2/(pc1+pc2)
}
# class means
mu1 = colMeans(Xptr[Xptr$X==0,c(1,2)])
mu2 = colMeans(Xptr[Xptr$X==1,c(1,2)])
# class covariances for QDA
Sigma1 = cov(Xptr[Xptr$X==0,c(1,2)])
Sigma2 = cov(Xptr[Xptr$X==1,c(1,2)])
# pooled estimate of the covariance for LDA
Sigma = ((sum(Xptr$X==0)-1)*Sigma1 + (sum(Xptr$X==1)-1)*Sigma2)/
  (length(Xptr$X)-2)

# (e)
# proportion of the K nearest neighbours of the query point X that belong
# to the class whose feature matrix is x1 (x2 holds the other class)
knn_mi = function(x1, x2, K=3, X)
{
  # Euclidean distances from X to every point of each class
  p1 = array(0, dim(x1)[1])
  for(k in 1:dim(x1)[1])
    p1[k] = sqrt((X[1]-x1[k,1])^2 + (X[2]-x1[k,2])^2)
  p2 = array(0, dim(x2)[1])
  for(k in 1:dim(x2)[1])
    p2[k] = sqrt((X[1]-x2[k,1])^2 + (X[2]-x2[k,2])^2)
  # count how many of the K smallest distances come from class 1
  P1 = 0
  for(el in sort(c(p1,p2))[1:K])
  {
    if(el %in% p1)
    {
      P1 = P1 + 1
    }
  }
  return(P1/K)
}

# (f)
# first load RSNNS
library(RSNNS)
fit.dnet = mlp(as.matrix(Xptr[,1:2]), Xptr$X, size=c(4,3),
               learnFuncParams=c(0.3), maxit=10000)
pred.dnet = predict(fit.dnet, Xpte[,1:2]) > 0.5
mean(Xte$X == pred.dnet)  # test accuracy
pred.dnet = predict(fit.dnet, Xnew) > 0.5

# (g)
# number of SGD epochs
M = 100
# weight decay
lambda = 10^(-8)
# SGD learning rate
alpha = 0.3
# classification threshold
u = 0.3
library(torch)
# Below is an example of how to create a simple torch dataset that
# pre-processes a data.frame into tensors so you can feed them to a model.
df_dataset = dataset(
  name = "Breast cancer",
  # the input data to your dataset goes in the initialize function;
  # our dataset takes a data frame, the names of the feature variables
  # and the name of the response variable.
  initialize = function(df, feature_variables, response_variable) {
    self$df = df[, feature_variables]
    self$response_variable = df[[response_variable]]
  },
  # the .getitem method takes an index as input and returns the
  # corresponding item from the dataset.
  .getitem = function(index) {
    response = torch_tensor(self$response_variable[index],
                            dtype = torch_float())
    x = torch_tensor(as.numeric(self$df[index,]))
    list(x = x, y = response)
  },
  # the .length method returns the number of elements in the dataset.
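  # (a torch dataloader uses .length() to determine how many items exist
  # and calls .getitem() on each sampled index when assembling a batch)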
  .length = function() {
    length(self$response_variable)
  }
)

features = c('PC1', 'PC2')
response = 'X'

# create training, testing and decision-boundary data sets
cancer_train = df_dataset(Xptr, feature_variables = features,
                          response_variable = response)
cancer_test = df_dataset(Xpte, feature_variables = features,
                         response_variable = response)
Xmc = cbind(X0, X1)
colnames(Xmc) = c('PC1', 'PC2')
# dummy response X = 1; only the features are used when predicting
cancer_mc = df_dataset(data.frame(Xmc, X = 1), feature_variables = features,
                       response_variable = response)

# see the 100th item
cancer_train$.getitem(100)

# prepare data loaders based on our data sets
dl_train = dataloader(cancer_train, batch_size = 10)
dl_test = dataloader(cancer_test, batch_size = 10)
dl_mc = dataloader(cancer_mc, batch_size = 1000)

# define a model class
net = nn_module(
  "CancerNet",
  initialize = function() {
    self$fc1 = nn_linear(length(features), 4, bias = FALSE)
    self$fc2 = nn_linear(4, 3, bias = FALSE)
    self$fc3 = nn_linear(3, 1, bias = FALSE)
  },
  forward = function(x) {
    x %>%
      self$fc1() %>%
      nnf_sigmoid() %>%
      self$fc2() %>%
      nnf_sigmoid() %>%
      self$fc3()
  }
)

# create the model
model = net()

# first implement the SGD updates by hand
lr = alpha
for (epoch in 1:M) {
  l = c()
  # coro::loop() is needed to iterate over a dataloader
  coro::loop(for (b in dl_train) {
    output = model(b$x)
    loss = nnf_binary_cross_entropy_with_logits(output, b$y)
    # add the weight-decay penalty on all parameters
    for (param in model$parameters) {
      loss = loss + lambda*torch::torch_norm(param)
    }
    loss$backward()
    with_no_grad({
      # manually update each parameter and reset its gradient
      for (param in model$parameters) {
        param$sub_(lr*param$grad)
        param$grad$zero_()
      }
    })
    lr = lr*0.999
    l = c(l, loss$item())
  })
  cat(sprintf("Loss at epoch %d: %.3f\n", epoch, mean(l)))
}

# put the model in evaluation mode (we no longer train it)
model$eval()

# evaluate the model on the test set
test_losses = c()
errors = 0
i = 0
coro::loop(for (b in dl_test) {
  output = model(b$x)
  loss = nnf_binary_cross_entropy_with_logits(output, b$y)
  # 1/(1+exp(-z)) maps the logits to probabilities;
  # |prediction - truth| is 1 exactly when the prediction is wrong
  errors = errors + sum(abs((1/(1+exp(-as.numeric(output))) > u) -
                              as.integer(b$y)))
  test_losses = c(test_losses, loss$item())
  i = i + length(as.integer(b$y))
})
mean(test_losses)
errors/i  # misclassification rate

# prepare the data to draw the decision boundary
preds = NULL
coro::loop(for (b in dl_mc) {
  output = model(b$x)
  preds = c(preds, 1/(1+exp(-as.numeric(output))) > u)
})
x1 = Xmc[which(preds==1),]
x0 = Xmc[which(preds==0),]
plot(x1, col=4, xlim=c(-10,10), ylim=c(-10,10), pch=16)
points(x0, col=7, pch=16)
points(Xptr[Xptr$X==0,c(1,2)], col=3, pch=1)
points(Xptr[Xptr$X==1,c(1,2)], col=2, pch=1)
points(Xpte[Xpte$X==0,c(1,2)], col=3, pch=16)
points(Xpte[Xpte$X==1,c(1,2)], col=2, pch=16)

# (i)
# loadings of the first principal component
plot.ts(x.prcomp$rotation[,1])

# (j)
library(gam)
fit.pc.gam = gam(X~s(PC1)+s(PC2), data=Xptr, family=binomial)
pred.pc.gam = predict(fit.pc.gam, Xpte, type="response") > 0.5

# (l)
# build a gam formula with a smooth term for each of the first k
# principal components
nam = names(Xptr)[1:30]
k = 20
formula = as.formula(paste("X", paste(paste("s(",nam[1:k],")",sep=""),
                                      collapse="+"), sep="~"))
print(formula)
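# A minimal sketch of how the constructed formula could then be used,
# fitting it with gam() just as in (j); this assumes the Xptr/Xpte data
# frames from above, and fit.pc.gam.k / pred.pc.gam.k are illustrative names:
fit.pc.gam.k = gam(formula, data=Xptr, family=binomial)
pred.pc.gam.k = predict(fit.pc.gam.k, Xpte, type="response") > 0.5
mean(Xpte$X == pred.pc.gam.k)  # test accuracy of the k-component model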