#=
X, Y = importData("spectra");
splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42);
split = createDataSplit(splits, nTrainValTest, 5, X, Y);
size(split["XTrain"])

###

X, G = importData("OVALung");
splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
XVal = my_split["XVal"];
=#

using Random
using Statistics   # mean() used below
using MAT          # matread() used in importData

"""
|
|
calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
|
|
calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)
|
|
|
|
Calculates RMSECV.
|
|
Second function calculates RMSECV according to data split given by variable 'split' (which should be output of
|
|
function computeCVSplitInds).
|
|
|
|
Returns rmsecv, meanrmsecv, where rmsecv is kmax x n_splits matrix, and meanrmsecv is vector of length kmax.
|
|
"""
|
|
function calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
|
|
|
|
splits = createCVSplitInds(X, n_splits, n_folds, rngseed);
|
|
rmsecv, meanrmsecv = calculateRMSECV(X, y, splits, regfunction, funcargs, emscpreproc=emscpreproc, emscdegree=emscdegree);
|
|
|
|
return rmsecv, meanrmsecv
|
|
end

function calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)

    n = size(X, 1);
    n_splits = size(splits, 2);
    B, _ = regfunction(X, y, funcargs...); # <- Slow, but works in general. Maybe add some special cases for known functions?
    kmax = size(B, 2);
    rmsecv = zeros(kmax, n_splits);
    n_folds = length(unique(splits[:, 1]));

    for i in 1:n_splits
        for j = 1:n_folds

            XTrain = X[splits[:, i] .!= j, :];
            XTest  = X[splits[:, i] .== j, :];
            yTrain = y[splits[:, i] .!= j, :];
            yTest  = y[splits[:, i] .== j, :];

            if emscpreproc
                XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0); # nRef, baseDeg, intF
                XTest, _ = EMSC(XTest, output["model"]);
            end

            B, _ = regfunction(XTrain, yTrain, funcargs...);

            for k = 1:kmax
                yTestPred, _ = predRegression(XTest, B[:, k], yTest);
                rmsecv[k, i] += sum((yTestPred - yTest).^2);
            end
        end
    end

    rmsecv = sqrt.(rmsecv ./ n);
    meanrmsecv = mean(rmsecv, dims=2);

    return rmsecv, meanrmsecv
end
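
#=
Usage sketch (illustration only), assuming a user-supplied regression function
`myRegression(X, y, args...)` that returns (B, ...) with one coefficient column
per model size -- the interface calculateRMSECV expects:

X, Y = importData("spectra");
rmsecv, meanrmsecv = calculateRMSECV(X, Y, myRegression, (10,); n_splits=5, n_folds=10);
minimum(meanrmsecv)   # best cross-validated error over model sizes
=#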

"""
    createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)

Creates n_splits data splits for n_folds cross-validation.

Output is a size(X,1) x n_splits array of type Int64 whose columns indicate fold
membership as integers (1, ..., n_folds). Any extra samples are assigned to the
lowest-numbered folds.
"""
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)

    n = size(X, 1);
    splits = convert(Matrix{Int64}, zeros(n, n_splits)); # fold membership coded as 1, 2, ..., n_folds
    fold_size = convert(Int64, floor(n / n_folds));
    left_over = n - fold_size * n_folds;
    start_vec = convert(Vector{Int64}, zeros(n));

    for i = 1:n_folds
        start_vec[(fold_size*(i-1)+1):fold_size*i] .= i;
    end

    for i = 1:left_over
        start_vec[fold_size*n_folds+i] = i;
    end

    Random.seed!(rngseed);

    for i = 1:n_splits
        splits[:, i] = shuffle(start_vec);
    end

    return splits
end
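
#=
Example sketch: three random 5-fold assignments for a data matrix X.

splits = createCVSplitInds(X, 3, 5);   # size(X,1) x 3, entries in 1:5
count(==(1), splits[:, 1])             # number of samples in fold 1 of the first split
=#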


"""
    modelSelection(results, results_type, selection_rule="min")

### Inputs
- results : Matrix/tensor of results, output from calculateRMSECV (kmax x n_splits) or calculateRMSE (3 x kmax x n_splits).
- results_type : "k-fold" or "train-val-test".
- selection_rule : "min" only for now. Can add 1 S.E., Chi^2, etc.

### Outputs
- results_sel : Results for the selected number of components. For "train-val-test", selection is done on the
  validation error and results_sel reports the corresponding test error.
- n_comps : The number of components chosen for each split.
"""
function modelSelection(results, results_type, selection_rule="min")

    if results_type == "k-fold"

        n_splits = size(results, 2);
        n_comps = convert(Vector{Int64}, zeros(n_splits));
        results_sel = zeros(n_splits);

        for i in 1:n_splits
            _, n_comps[i] = findmin(results[:, i]);
            results_sel[i] = results[n_comps[i], i];
        end

    elseif results_type == "train-val-test"

        n_iter = size(results, 3);
        results_sel = zeros(n_iter);
        n_comps = convert(Vector{Int64}, zeros(n_iter));

        for i = 1:n_iter
            _, n_comps[i] = findmin(results[2, :, i]);
            results_sel[i] = results[3, n_comps[i], i];
        end
    end

    return results_sel, n_comps
end
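
#=
Sketch: choosing the number of components, assuming a hypothetical regression function `myRegression`.

rmsecv, _ = calculateRMSECV(X, Y, myRegression, (20,));
sel_cv, ncomp_cv = modelSelection(rmsecv, "k-fold");

rmse, _ = calculateRMSE(X, Y, myRegression, (20,), 10);
sel_tvt, ncomp_tvt = modelSelection(rmse, "train-val-test");   # selects on validation, reports test error
=#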


"""
    predRegression(X, beta, y)
    predRegression(X::Vector{Float64}, beta, y)

Prediction for a linear model of the form y = X * beta [+ b0].
If length(beta) == size(X,2) + 1, the first element of beta is treated as the intercept.

Returns ypred, rmsep.
"""
function predRegression(X, beta, y)

    if length(beta) == size(X, 2)
        ypred = X * beta;
    elseif length(beta) == size(X, 2) + 1
        ypred = beta[1] .+ X * beta[2:end];
    end

    rmsep = sqrt(mean((y .- ypred).^2));

    return ypred, rmsep
end

function predRegression(X::Vector{Float64}, beta, y)

    if length(beta) == length(X)
        ypred = X' * beta;
    elseif length(beta) == length(X) + 1
        ypred = beta[1] .+ X' * beta[2:end];
    end

    rmsep = sqrt(mean((y .- ypred).^2));

    return ypred, rmsep
end
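
#=
Sketch: predict with the k-th coefficient vector of a fitted model
(`myRegression` is a hypothetical regression function; B is p x kmax, or (p+1) x kmax with intercept).

B, _ = myRegression(XTrain, yTrain, 10);
yhat, rmsep = predRegression(XTest, B[:, 3], yTest);
=#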


"""
    calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)
    calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)

Calculates RMSE on train/val/test sets for regfunction with arguments funcargs.

Assumes regfunction takes arguments of the form regfunction(X, y, funcargs...) and that its first return
value is the matrix of regression coefficients. The second method uses the splits given as input.

Returns `rmse, meanrmse`, where `rmse` is a 3 x kmax x nsplits array (rows: train/val/test) and
`meanrmse` is its mean over splits.
"""
function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)

    splits, nTrainValTest = createDataSplitInds(X, nsplits, props, rngseed);
    rmse, meanrmse = calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc, emscdegree);

    return rmse, meanrmse
end

function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)

    nsplits = size(splits, 2);
    B, _ = regfunction(X, y, funcargs...); # <- Slow, but works in general. Maybe add some special cases for known functions?
    kmax = size(B, 2);
    rmse = zeros(3, kmax, nsplits);

    for i in 1:nsplits
        split = createDataSplit(splits, nTrainValTest, i, X, y);
        XTrain = split["XTrain"];
        XVal   = split["XVal"];
        XTest  = split["XTest"];
        yTrain = split["YTrain"];
        yVal   = split["YVal"];
        yTest  = split["YTest"];

        if emscpreproc
            XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0); # nRef, baseDeg, intF
            XVal, _  = EMSC(XVal, output["model"]);
            XTest, _ = EMSC(XTest, output["model"]);
        end

        B, _ = regfunction(XTrain, yTrain, funcargs...);

        for j = 1:kmax
            _, rmse[1, j, i] = predRegression(XTrain, B[:, j], yTrain);
            _, rmse[2, j, i] = predRegression(XVal, B[:, j], yVal);
            _, rmse[3, j, i] = predRegression(XTest, B[:, j], yTest);
        end
    end

    meanrmse = dropdims(mean(rmse, dims=3), dims=3);

    return rmse, meanrmse
end
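
#=
Sketch: 10 random train/val/test splits, again assuming a hypothetical regression function `myRegression`.

rmse, meanrmse = calculateRMSE(X, Y, myRegression, (15,), 10);   # rmse is 3 x kmax x 10 (train/val/test)
=#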


"""
    createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Creates training/validation/test splits for a dataset. Samples left over after applying
the proportions are added to the training set.

### Arguments
- 'X' : Data matrix, or integer giving the number of samples
- 'nSplits' : Int, number of data splits to create
- 'props' : 3x1 vector of proportions (train/val/test), must sum to 1
- 'rngseed' : Int, for reproducibility

### Returns
- 'splits' : size(X,1) x nSplits matrix with sample indices
- 'nTrainValTest' : 3x1 vector giving the number of samples in training/validation/testing
"""
function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

    n = size(X, 1);
    nTrainValTest = zeros(3);
    splits = convert(Matrix{Int64}, zeros(n, nSplits));

    if abs(sum(props) - 1) > 1e-14
        error("Error! Proportions do not add up to 1");
    end

    nTrain = convert(Int64, floor(n * props[1]));
    nVal   = convert(Int64, floor(n * props[2]));
    nTest  = convert(Int64, floor(n * props[3]));
    nLeft  = n - (nTrain + nVal + nTest);
    nTrain = nTrain + nLeft;

    Random.seed!(rngseed);

    for i in 1:nSplits
        inds = randperm(n);

        splits[1:nTrain, i]               = inds[1:nTrain];
        splits[nTrain+1 : nTrain+nVal, i] = inds[nTrain+1 : nTrain+nVal];
        splits[nTrain+nVal+1 : end, i]    = inds[nTrain+nVal+1 : end];
    end

    nTrainValTest = [nTrain, nVal, nTest];

    return splits, nTrainValTest
end

function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

    n = X;
    nTrainValTest = zeros(3);
    splits = convert(Matrix{Int64}, zeros(n, nSplits));

    if abs(sum(props) - 1) > 1e-14
        error("Error! Proportions do not add up to 1");
    end

    nTrain = convert(Int64, floor(n * props[1]));
    nVal   = convert(Int64, floor(n * props[2]));
    nTest  = convert(Int64, floor(n * props[3]));
    nLeft  = n - (nTrain + nVal + nTest);
    nTrain = nTrain + nLeft;

    Random.seed!(rngseed);

    for i in 1:nSplits
        inds = randperm(n);

        splits[1:nTrain, i]               = inds[1:nTrain];
        splits[nTrain+1 : nTrain+nVal, i] = inds[nTrain+1 : nTrain+nVal];
        splits[nTrain+nVal+1 : end, i]    = inds[nTrain+nVal+1 : end];
    end

    nTrainValTest = [nTrain, nVal, nTest];

    return splits, nTrainValTest
end
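
#=
Sketch: splits can also be created from a sample count alone (the Int64 method).

splits, nTVT = createDataSplitInds(100, 3);   # 100 samples, 3 random splits
trainInds = splits[1:nTVT[1], 1];             # training-sample indices of the first split
=#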

"""
    createDataSplit(splits, nTrainValTest, splitInd, X, Y)

Creates a training/validation/test split of the dataset.

### Arguments
- 'splits' : size(X,1) x nSplits matrix with sample indices
- 'nTrainValTest' : 3x1 vector giving the number of samples in training/validation/testing
- 'splitInd' : Index of the split to use
- 'X' and 'Y' : Data matrix and response vector/matrix

### Returns a dictionary with keys XTrain, XVal, XTest, YTrain, YVal, YTest.
"""
function createDataSplit(splits, nTrainValTest, splitInd, X, Y)

    XTrain = X[splits[1 : nTrainValTest[1], splitInd], :];
    XVal   = X[splits[nTrainValTest[1]+1 : nTrainValTest[1]+nTrainValTest[2], splitInd], :];
    XTest  = X[splits[nTrainValTest[1]+nTrainValTest[2]+1 : end, splitInd], :];

    YTrain = Y[splits[1 : nTrainValTest[1], splitInd], :];
    YVal   = Y[splits[nTrainValTest[1]+1 : nTrainValTest[1]+nTrainValTest[2], splitInd], :];
    YTest  = Y[splits[nTrainValTest[1]+nTrainValTest[2]+1 : end, splitInd], :];

    return Dict([("XTrain", XTrain), ("XVal", XVal), ("XTest", XTest), ("YTrain", YTrain), ("YVal", YVal), ("YTest", YTest)]);
end

"""
    createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Creates stratified training/validation/test splits for a dataset with two groups.
Assumes group membership in G is coded as 1 and 2.

### Arguments
- 'G' : Vector, group membership coded as 1, 2
- 'nSplits' : Int, number of data splits
- 'props' : 3x1 vector of proportions (train/val/test), must sum to 1
- 'rngseed' : Int, for reproducibility

### Returns
- 'splits' : length(G) x nSplits matrix with sample indices
- 'nTrainValTest' : 3x1 vector giving the number of samples in training/validation/testing
"""
function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

    n = length(G);
    splits = convert(Matrix{Int64}, zeros(n, nSplits));

    indsG1 = findall(@. abs(G - 1.0) < 1e-14);
    indsG2 = findall(@. abs(G - 2.0) < 1e-14);

    nG1 = length(indsG1);
    nG2 = length(indsG2);

    propG1 = nG1 / n;
    propG2 = 1 - propG1;

    nTrain1 = convert(Int64, floor(n * props[1] * propG1));
    nVal1   = convert(Int64, floor(n * props[2] * propG1));
    nTest1  = convert(Int64, floor(n * props[3] * propG1));
    nTrain2 = convert(Int64, floor(n * props[1] * propG2));
    nVal2   = convert(Int64, floor(n * props[2] * propG2));
    nTest2  = convert(Int64, floor(n * props[3] * propG2));

    # Unassigned samples (if any) are added to the training set
    nLeftG1 = nG1 - (nTrain1 + nVal1 + nTest1);
    nLeftG2 = nG2 - (nTrain2 + nVal2 + nTest2);

    nTrain1 = nTrain1 + nLeftG1;
    nTrain2 = nTrain2 + nLeftG2;

    nTrain = nTrain1 + nTrain2;
    nVal   = nVal1 + nVal2;
    nTest  = nTest1 + nTest2;

    if (nTrain1 + nTrain2 + nVal1 + nVal2 + nTest1 + nTest2) != n
        error("Error! Some samples are not assigned!");
    end

    Random.seed!(rngseed);

    for i in 1:nSplits
        G1IndsShuffle = randperm(nG1);
        G2IndsShuffle = randperm(nG2);

        splits[1 : nTrain, i]             = [indsG1[G1IndsShuffle[1:nTrain1]]; indsG2[G2IndsShuffle[1:nTrain2]]];
        splits[nTrain+1 : nTrain+nVal, i] = [indsG1[G1IndsShuffle[nTrain1+1 : nTrain1+nVal1]]; indsG2[G2IndsShuffle[nTrain2+1 : nTrain2+nVal2]]];
        splits[nTrain+nVal+1 : end, i]    = [indsG1[G1IndsShuffle[nTrain1+nVal1+1 : end]]; indsG2[G2IndsShuffle[nTrain2+nVal2+1 : end]]];
    end

    nTrainValTest = [nTrain, nVal, nTest];

    return splits, nTrainValTest
end
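
#=
Sketch: check that a stratified split roughly preserves the class balance
(continuing the OVALung example from the header comment).

trainG = G[splitStrat[1:nTVT[1], 1]];
mean(trainG .== 1)   # should be close to mean(G .== 1)
=#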


"""
    importData(datasetName, datasetPath="/home/jovyan/Datasets/")

Example: X, y, waves = importData("Beer");

Returns X, Y/G, [waves], plus more for some datasets.

Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb,
leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original,
Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11
"""
function importData(datasetName, datasetPath="/home/jovyan/Datasets/")

    #datasetName = "Beer"
    #datasetPath = "/home/jovyan/Datasets/"
    vars = matread(string(datasetPath, datasetName, ".mat"));

    if datasetName == "9tumors"
        X = vars["X"];
        G = vars["G"];
        return X, G
    elseif datasetName == "bacteria_K4_K5_K6"
        # Causes crash for some reason
    elseif datasetName == "Beer"
        X = vars["X"];
        Y = vars["Y"]; # G for some datasets
        waves = 400:2:2250;
        return X, Y, waves
    elseif datasetName == "Dough"
        X = [vars["Xtrain"]; vars["Xtest"]];
        Y = [vars["Ytrain"]; vars["Ytest"]];
        #if response != "all"
        #    Y = Y[:,response];
        #    Y = dropdims(Y;dims=2);
        #end
        return X, Y
    elseif datasetName == "FTIR_AMW_tidied"
        X = vars["X"];
        Y = vars["Y"];
        return X, Y
    elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP"

    elseif datasetName == "HPLCforweb"
        X = vars["HPLCforweb"]["data"];
        return X
    elseif datasetName == "leukemia1"
        X = vars["X"];
        G = vars["G"];
        return X, G
    elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact"

    elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW!
        X = vars["Ramandata"];
        Y = vars["PUFAdata"];
        waves = vars["Wavelengths_Raman"];
        return X, Y, waves
    elseif datasetName == "NMR_WineData"
        X = vars["X"];
        Y = vars["Y"];
        ppm = vars["ppm"];
        return X, Y, ppm
    elseif datasetName == "onion_NMR"
        X = vars["x"];
        Y = vars["onion"];
        ppm = vars["ppm"];
        return X, Y, ppm
    elseif datasetName == "OVALung"
        X = vars["X"];
        G = vars["y"];
        return X, G
    elseif datasetName == "Raman_Adipose_fat_pig_original"
        X = vars["spectra"];
        Y = vars["fat"];
        baseline = vars["baseline"];
        waves = parse.(Float64, vars["wavelength"]);
        return X, Y, waves, baseline
    elseif datasetName == "Ramanmilk"
        # There are replicates that need to be handled
        #X = vars["SpectraR"]
        #Y = vars["CLA"]
    elseif datasetName == "RAMANPorkFat"
        X = vars["X"]["data"];
        Y = vars["Y"]["data"];
        return X, Y
    elseif datasetName == "spectra"
        X = vars["NIR"];
        Y = vars["octane"];
        waves = 900:2:1700;
        return X, Y, waves
    elseif datasetName == "Sugar"
        X = [vars["Xtrain"]; vars["Xtest"]];
        Y = [vars["Ytrain"]; vars["Ytest"]];
        waves = vars["wave"];
        return X, Y, waves
    elseif datasetName == "tumor11"
        X = vars["X"];
        G = vars["G"];
        return X, G
    else
        error("Unknown datasetName: ", datasetName);
    end
end