#= Usage examples:
   X, Y = importData("spectra");
   splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42);
   split = createDataSplit(splits, nTrainValTest, 5, X, Y);
   size(split["XTrain"])

   X, G = importData("OVALung");
   splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
   my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
   XVal = my_split["XVal"];
=#

using Random
using Statistics  # mean — used by calculateRMSECV / calculateRMSE

# NOTE(review): `EMSC` and `matread` (MAT.jl) are defined/imported elsewhere in the
# project — this file only calls them.

"""
    calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42,
                    emscpreproc=false, emscdegree=6)
    calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)

Calculate RMSECV. The second method computes RMSECV according to the data split given by
`splits` (output of `createCVSplitInds`); the first method creates the splits itself.

`regfunction` must have signature `regfunction(X, y, funcargs...)` and return the matrix
of regression coefficients (columns = number of components) as its first value.

Returns `rmsecv, meanrmsecv`, where `rmsecv` is a `kmax x n_splits` matrix and
`meanrmsecv` is a vector of length `kmax`.
"""
function calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42,
                         emscpreproc=false, emscdegree=6)
    splits = createCVSplitInds(X, n_splits, n_folds, rngseed)
    rmsecv, meanrmsecv = calculateRMSECV(X, y, splits, regfunction, funcargs,
                                         emscpreproc=emscpreproc, emscdegree=emscdegree)
    return rmsecv, meanrmsecv
end

function calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)
    n = size(X, 1)
    n_splits = size(splits, 2)
    # Fit once on the full data only to learn kmax (number of coefficient columns).
    # Slow, but works in general. Maybe add some special cases for known functions?
    B, _ = regfunction(X, y, funcargs...)
    kmax = size(B, 2)
    rmsecv = zeros(kmax, n_splits)
    n_folds = length(unique(splits[:, 1]))
    for i in 1:n_splits
        for j in 1:n_folds
            # Fold j is the held-out test set; all other folds form the training set.
            XTrain = X[splits[:, i] .!= j, :]
            XTest  = X[splits[:, i] .== j, :]
            yTrain = y[splits[:, i] .!= j, :]
            yTest  = y[splits[:, i] .== j, :]
            if emscpreproc
                # Fit the EMSC model on the training fold only, then apply it to the
                # test fold to avoid leaking test information into preprocessing.
                XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0)  # nRef, baseDeg, intF
                XTest, _ = EMSC(XTest, output["model"])
            end
            B, _ = regfunction(XTrain, yTrain, funcargs...)
            for k in 1:kmax
                yTestPred, _ = predRegression(XTest, B[:, k], yTest)
                # Accumulate squared errors over folds; normalized by n below.
                rmsecv[k, i] += sum((yTestPred - yTest) .^ 2)
            end
        end
    end
    rmsecv = sqrt.(rmsecv ./ n)
    meanrmsecv = mean(rmsecv, dims=2)
    return rmsecv, meanrmsecv
end

"""
    createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)

Create `n_splits` data splits for `n_folds`-fold cross validation.

Returns a `size(X,1) x n_splits` `Int64` matrix whose columns give fold membership as
integers `1, ..., n_folds`. Extra samples (if any) are assigned to the lower fold indices.
"""
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
    n = size(X, 1)
    fold_size = div(n, n_folds)
    left_over = n - fold_size * n_folds
    # Build the unshuffled fold-membership vector: fold_size copies of each fold index...
    base = zeros(Int64, n)
    for f in 1:n_folds
        base[(fold_size * (f - 1) + 1):(fold_size * f)] .= f
    end
    # ...then hand each leftover sample to folds 1, 2, ... in order.
    for r in 1:left_over
        base[fold_size * n_folds + r] = r
    end
    Random.seed!(rngseed)
    splits = zeros(Int64, n, n_splits)
    for i in 1:n_splits
        splits[:, i] = shuffle(base)
    end
    return splits
end

"""
    modelSelection(results, results_type, selection_rule="min")

Select the number of components per split from a results matrix/tensor.

# Arguments
- `results` : Matrix/Tensor with results, output from calculateRMSE[CV].
- `results_type` : `"k-fold"` (`kmax x n_splits` matrix) or `"train-val-test"`
  (`3 x kmax x n_iter` tensor with rows train/val/test).
- `selection_rule` : `"min"` only for now. Can add 1 S.E., Chi^2, etc.

# Returns
- `results_sel` : Results for the selected number of components
  (for `"train-val-test"`: the test-set value at the validation-optimal k).
- `n_comps` : The number of components chosen for each split.
"""
function modelSelection(results, results_type, selection_rule="min")
    if results_type == "k-fold"
        n_splits = size(results, 2)
        n_comps = zeros(Int64, n_splits)
        results_sel = zeros(n_splits)
        for i in 1:n_splits
            _, n_comps[i] = findmin(results[:, i])
            results_sel[i] = results[n_comps[i], i]
        end
    elseif results_type == "train-val-test"
        n_iter = size(results, 3)
        results_sel = zeros(n_iter)
        n_comps = zeros(Int64, n_iter)
        for i in 1:n_iter
            # Pick k minimizing validation error (row 2), report test error (row 3).
            _, n_comps[i] = findmin(results[2, :, i])
            results_sel[i] = results[3, n_comps[i], i]
        end
    else
        error("Unknown results_type: $results_type")
    end
    return results_sel, n_comps
end

"""
    predRegression(X, beta, y)
    predRegression(X::Vector{Float64}, beta, y)

Prediction for a linear model of the form `y = X * beta [+ b0]`. If `beta` has one more
element than `X` has columns, `beta[1]` is treated as the intercept.

Returns `ypred, rmsep`.
"""
function predRegression(X, beta, y)
    if length(beta) == size(X, 2)
        ypred = X * beta
    elseif length(beta) == size(X, 2) + 1
        ypred = beta[1] .+ X * beta[2:end]  # first coefficient is the intercept
    else
        error("Length of beta does not match number of columns in X")
    end
    # Fixed: square the residuals BEFORE taking the mean (previously |mean residual|).
    rmsep = sqrt(mean((y - ypred) .^ 2))
    return ypred, rmsep
end

function predRegression(X::Vector{Float64}, beta, y)
    # Single-sample variant: X is one observation, prediction is a scalar.
    if length(beta) == length(X)
        ypred = X' * beta
    elseif length(beta) == length(X) + 1
        ypred = beta[1] .+ X' * beta[2:end]  # first coefficient is the intercept
    else
        error("Length of beta does not match length of X")
    end
    # Fixed: square the residuals BEFORE taking the mean (previously |mean residual|).
    rmsep = sqrt(mean((y - ypred) .^ 2))
    return ypred, rmsep
end

"""
    calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10],
                  rngseed=42, emscpreproc=false, emscdegree=6)
    calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs,
                  emscpreproc=false, emscdegree=6)

Calculate RMSE for train/val/test sets for `regfunction` with args `funcargs`.
Assumes `regfunction` takes arguments of the form `regfunction(X, y, funcargs...)` and
its first return value is the matrix of regression coefficients. The second method uses
the splits given as input.

Returns `rmse, meanrmse`, where `rmse` is `3 x kmax x nsplits` (rows train/val/test)
and `meanrmse` is `3 x kmax`, averaged over splits.
"""
function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10],
                       rngseed=42, emscpreproc=false, emscdegree=6)
    splits, nTrainValTest = createDataSplitInds(X, nsplits, props, rngseed)
    rmse, meanrmse = calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs,
                                   emscpreproc, emscdegree)
    return rmse, meanrmse
end

function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs,
                       emscpreproc=false, emscdegree=6)
    nsplits = size(splits, 2)
    # Fit once on the full data only to learn kmax (number of coefficient columns).
    # Slow, but works in general. Maybe add some special cases for known functions?
    B, _ = regfunction(X, y, funcargs...)
    kmax = size(B, 2)
    rmse = zeros(3, kmax, nsplits)  # rows: 1 = train, 2 = val, 3 = test
    for i in 1:nsplits
        split = createDataSplit(splits, nTrainValTest, i, X, y)
        XTrain = split["XTrain"]; XVal = split["XVal"]; XTest = split["XTest"]
        yTrain = split["YTrain"]; yVal = split["YVal"]; yTest = split["YTest"]
        if emscpreproc
            # Fit EMSC on the training set only; apply the fitted model to val/test.
            XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0)  # nRef, baseDeg, intF
            XVal, _ = EMSC(XVal, output["model"])
            XTest, _ = EMSC(XTest, output["model"])
        end
        B, _ = regfunction(XTrain, yTrain, funcargs...)
        for j in 1:kmax
            _, rmse[1, j, i] = predRegression(XTrain, B[:, j], yTrain)
            _, rmse[2, j, i] = predRegression(XVal, B[:, j], yVal)
            _, rmse[3, j, i] = predRegression(XTest, B[:, j], yTest)
        end
    end
    meanrmse = dropdims(mean(rmse, dims=3), dims=3)
    return rmse, meanrmse
end

"""
    createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Create training/validation/test splits for a dataset.

# Arguments
- `X` : Data matrix, or integer giving the number of samples directly
- `nSplits` : Int, number of data splits to create
- `props` : 3x1 Vector of proportions (train/val/test), must sum to 1
- `rngseed` : Int, for reproducibility

# Returns
- `splits` : `n x nSplits` matrix of sample indices; rows `1:nTrain` are the training
  samples, the next `nVal` rows validation, the rest test
- `nTrainValTest` : 3x1 vector with the number of training/validation/test samples

Samples left over after rounding down are added to the training set.
"""
function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    # Matrix form simply delegates to the sample-count method.
    return createDataSplitInds(size(X, 1), nSplits, props, rngseed)
end

function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    n = X
    if abs(sum(props) - 1) > 1e-14
        error("Error! Proportions do not add up to 1")
    end
    nTrain = floor(Int64, n * props[1])
    nVal   = floor(Int64, n * props[2])
    nTest  = floor(Int64, n * props[3])
    # Rounding leftovers go to the training set.
    nTrain += n - (nTrain + nVal + nTest)
    splits = zeros(Int64, n, nSplits)
    Random.seed!(rngseed)
    for i in 1:nSplits
        # Each column is a fresh permutation of all sample indices; the contiguous
        # segments 1:nTrain, nTrain+1:nTrain+nVal, rest define the three subsets.
        splits[:, i] = randperm(n)
    end
    return splits, [nTrain, nVal, nTest]
end

"""
    createDataSplit(splits, nTrainValTest, splitInd, X, Y)

Materialize one training/validation/test split of the dataset.

# Arguments
- `splits` : `size(X,1) x nSplits` matrix with indices
- `nTrainValTest` : 3x1 vector with the number of training/validation/test samples
- `splitInd` : Index of split (column of `splits`) to use
- `X` and `Y` : Data matrix and response vector/matrix

# Returns
Dictionary with keys `XTrain`, `XVal`, `XTest`, `YTrain`, `YVal`, `YTest`.
"""
function createDataSplit(splits, nTrainValTest, splitInd, X, Y)
    nTrain = nTrainValTest[1]
    nVal   = nTrainValTest[2]
    trainInds = splits[1:nTrain, splitInd]
    valInds   = splits[nTrain+1:nTrain+nVal, splitInd]
    testInds  = splits[nTrain+nVal+1:end, splitInd]
    return Dict("XTrain" => X[trainInds, :],
                "XVal"   => X[valInds, :],
                "XTest"  => X[testInds, :],
                "YTrain" => Y[trainInds, :],
                "YVal"   => Y[valInds, :],
                "YTest"  => Y[testInds, :])
end

"""
    createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Create stratified training/validation/test splits for a two-group dataset.
Assumes `G` is coded as 1, 2.

# Arguments
- `G` : Vector, group membership coded as 1, 2
- `nSplits` : Int, number of data splits
- `props` : 3x1 Vector of proportions (train/val/test), must sum to 1
- `rngseed` : Int, for reproducibility

# Returns
- `splits` : `length(G) x nSplits` matrix with indices (train rows first, then val, then test)
- `nTrainValTest` : 3x1 vector with the number of training/validation/test samples
"""
function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    n = length(G)
    splits = zeros(Int64, n, nSplits)
    # Tolerance comparison so integer- or float-coded group labels both work.
    indsG1 = findall(@. abs(G - 1.0) < 1e-14)
    indsG2 = findall(@. abs(G - 2.0) < 1e-14)
    nG1 = length(indsG1)
    nG2 = length(indsG2)
    propG1 = nG1 / n
    propG2 = 1 - propG1
    # Per-group subset sizes, proportional to group prevalence.
    nTrain1 = floor(Int64, n * props[1] * propG1)
    nVal1   = floor(Int64, n * props[2] * propG1)
    nTest1  = floor(Int64, n * props[3] * propG1)
    nTrain2 = floor(Int64, n * props[1] * propG2)
    nVal2   = floor(Int64, n * props[2] * propG2)
    nTest2  = floor(Int64, n * props[3] * propG2)
    # Unassigned samples (if any) are added to the training set.
    nTrain1 += nG1 - (nTrain1 + nVal1 + nTest1)
    nTrain2 += nG2 - (nTrain2 + nVal2 + nTest2)
    nTrain = nTrain1 + nTrain2
    nVal   = nVal1 + nVal2
    nTest  = nTest1 + nTest2
    if (nTrain + nVal + nTest) != n
        error("Error! Some samples are not assigned!")
    end
    Random.seed!(rngseed)
    for i in 1:nSplits
        p1 = randperm(nG1)
        p2 = randperm(nG2)
        # Each subset takes its quota from both groups (group-1 indices first).
        splits[1:nTrain, i] =
            [indsG1[p1[1:nTrain1]]; indsG2[p2[1:nTrain2]]]
        splits[nTrain+1:nTrain+nVal, i] =
            [indsG1[p1[nTrain1+1:nTrain1+nVal1]]; indsG2[p2[nTrain2+1:nTrain2+nVal2]]]
        splits[nTrain+nVal+1:end, i] =
            [indsG1[p1[nTrain1+nVal1+1:end]]; indsG2[p2[nTrain2+nVal2+1:end]]]
    end
    return splits, [nTrain, nVal, nTest]
end

"""
    importData(datasetName, datasetPath="/home/jovyan/Datasets/")

Load a named dataset from a `.mat` file. Example: `X, y, waves = importData("Beer");`

Returns `X, Y/G, [waves], +more` for some datasets.

Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough,
FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb, leukemia1,
MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman],
NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original,
Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11
"""
function importData(datasetName, datasetPath="/home/jovyan/Datasets/")
    vars = matread(string(datasetPath, datasetName, ".mat"))
    if datasetName == "9tumors"
        X = vars["X"]
        G = vars["G"]
        return X, G
    elseif datasetName == "bacteria_K4_K5_K6"
        # Causes crash for some reason — not loaded.
    elseif datasetName == "Beer"
        X = vars["X"]
        Y = vars["Y"]  # G for some datasets
        waves = 400:2:2250
        return X, Y, waves
    elseif datasetName == "Dough"
        X = [vars["Xtrain"]; vars["Xtest"]]
        Y = [vars["Ytrain"]; vars["Ytest"]]
        return X, Y
    elseif datasetName == "FTIR_AMW_tidied"
        X = vars["X"]
        Y = vars["Y"]
        return X, Y
    elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP"
        # Not handled yet.
    elseif datasetName == "HPLCforweb"
        X = vars["HPLCforweb"]["data"]
        return X
    elseif datasetName == "leukemia1"
        X = vars["X"]
        G = vars["G"]
        return X, G
    elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact"
        # Not handled yet.
    elseif datasetName == "NIR_Raman_PUFA"
        # ONLY RAMAN FOR NOW!
        X = vars["Ramandata"]
        Y = vars["PUFAdata"]
        waves = vars["Wavelengths_Raman"]
        return X, Y, waves
    elseif datasetName == "NMR_WineData"
        X = vars["X"]
        Y = vars["Y"]
        ppm = vars["ppm"]
        return X, Y, ppm
    elseif datasetName == "onion_NMR"
        X = vars["x"]
        Y = vars["onion"]
        ppm = vars["ppm"]
        return X, Y, ppm
    elseif datasetName == "OVALung"
        X = vars["X"]
        G = vars["y"]
        return X, G
    elseif datasetName == "Raman_Adipose_fat_pig_original"
        X = vars["spectra"]
        Y = vars["fat"]
        baseline = vars["baseline"]
        waves = parse.(Float64, vars["wavelength"])  # stored as strings in the .mat file
        return X, Y, waves, baseline
    elseif datasetName == "Ramanmilk"
        # There are replicates that need to be handled before this can be exposed:
        # X = vars["SpectraR"]; Y = vars["CLA"]
    elseif datasetName == "RAMANPorkFat"
        X = vars["X"]["data"]
        Y = vars["Y"]["data"]
        return X, Y
    elseif datasetName == "spectra"
        X = vars["NIR"]
        Y = vars["octane"]
        waves = 900:2:1700
        return X, Y, waves
    elseif datasetName == "Sugar"
        X = [vars["Xtrain"]; vars["Xtest"]]
        Y = [vars["Ytrain"]; vars["Ytest"]]
        waves = vars["wave"]
        return X, Y, waves
    elseif datasetName == "tumor11"
        X = vars["X"]
        G = vars["G"]
        return X, G
    end
end