# MinPakke.jl/src/convenience.jl
# 2023-05-13 07:52:47 +02:00
#=
X, Y = importData("spectra");
splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42);
split = createDataSplit(splits, nTrainValTest, 5, X, Y);
size(split["XTrain"])
###
X, G = importData("OVALung");
splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
XVal = my_split["XVal"];
=#
using Random
"""
    calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
    calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)

Compute the cross-validated RMSE for the regression method `regfunction`.

This convenience method draws `n_splits` random `n_folds`-fold CV splits
(seeded with `rngseed`) via `createCVSplitInds` and delegates to the method
that takes explicit split indices.

Returns `(rmsecv, meanrmsecv)` where `rmsecv` is a `kmax x n_splits` matrix
and `meanrmsecv` its mean over splits.
"""
function calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
    cv_inds = createCVSplitInds(X, n_splits, n_folds, rngseed)
    return calculateRMSECV(X, y, cv_inds, regfunction, funcargs;
                           emscpreproc=emscpreproc, emscdegree=emscdegree)
end
"""
    calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)

Compute RMSECV for the fold memberships in `splits` (output of `createCVSplitInds`).

For every split (column of `splits`) and every fold, the model is fitted on the
out-of-fold samples and evaluated on the held-out fold; squared prediction
errors are accumulated per model size and finally averaged over all `n` samples.

# Arguments
- `X`, `y` : data matrix and response.
- `splits` : `n x n_splits` matrix of fold labels (1, ..., n_folds).
- `regfunction` : called as `regfunction(X, y, funcargs...)`; its first return
  value must be a coefficient matrix with one column per model size.
- `emscpreproc`, `emscdegree` : when `true`, an EMSC model is fitted on each
  training fold and applied to the corresponding held-out fold.

Returns `(rmsecv, meanrmsecv)` where `rmsecv` is `kmax x n_splits`.
"""
function calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)
    n = size(X, 1)
    n_splits = size(splits, 2)
    # Probe fit on the full data only to learn the number of model sizes (kmax).
    # Slow, but works for any regfunction.
    Bfull, _ = regfunction(X, y, funcargs...)
    kmax = size(Bfull, 2)
    n_folds = length(unique(splits[:, 1]))
    sse = zeros(kmax, n_splits)
    for s in 1:n_splits, fold in 1:n_folds
        holdout = splits[:, s] .== fold
        Xfit = X[.!holdout, :]
        Xhold = X[holdout, :]
        yfit = y[.!holdout, :]
        yhold = y[holdout, :]
        if emscpreproc
            Xfit, emscout = EMSC(Xfit, emscdegree, "svd", 1, -1, 0) # nRef, baseDeg, intF
            Xhold, _ = EMSC(Xhold, emscout["model"])
        end
        B, _ = regfunction(Xfit, yfit, funcargs...)
        for k in 1:kmax
            yhat, _ = predRegression(Xhold, B[:, k], yhold)
            sse[k, s] += sum((yhat - yhold) .^ 2)
        end
    end
    rmsecv = sqrt.(sse ./ n)
    return rmsecv, mean(rmsecv, dims=2)
end
"""
    createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)

Generate `n_splits` random fold assignments for `n_folds`-fold cross-validation.

Returns a `size(X, 1) x n_splits` `Int64` matrix whose entries are fold labels
`1, ..., n_folds`. When `n` is not divisible by `n_folds`, the leftover samples
go to the lowest-numbered folds. Shuffling is seeded with `rngseed` for
reproducibility.
"""
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
    n = size(X, 1)
    base = div(n, n_folds)         # minimum fold size
    remainder = n - base * n_folds # extras: one each to folds 1..remainder
    labels = vcat([fill(i, base) for i in 1:n_folds]..., collect(1:remainder))
    assignments = zeros(Int64, n, n_splits)
    Random.seed!(rngseed)
    for s in 1:n_splits
        assignments[:, s] = shuffle(labels)
    end
    return assignments
end
"""
    modelSelection(results, results_type, selection_rule="min")

Pick the model size minimising the validation criterion for each split.

# Arguments
- `results` : output of `calculateRMSECV` (matrix, `results_type = "k-fold"`)
  or `calculateRMSE` (3-way tensor, `results_type = "train-val-test"`).
- `selection_rule` : only `"min"` is implemented for now.

# Returns
- `results_sel` : criterion value at the selected size per split. For
  `"train-val-test"` the size is chosen on the validation row (row 2) and the
  reported value comes from the test row (row 3).
- `n_comps` : selected number of components per split.
"""
function modelSelection(results, results_type, selection_rule="min")
    if results_type == "k-fold"
        nsplit = size(results, 2)
        n_comps = zeros(Int64, nsplit)
        results_sel = zeros(nsplit)
        for s in 1:nsplit
            n_comps[s] = argmin(results[:, s])
            results_sel[s] = results[n_comps[s], s]
        end
    elseif results_type == "train-val-test"
        nsplit = size(results, 3)
        n_comps = zeros(Int64, nsplit)
        results_sel = zeros(nsplit)
        for s in 1:nsplit
            n_comps[s] = argmin(results[2, :, s])
            results_sel[s] = results[3, n_comps[s], s]
        end
    end
    return results_sel, n_comps
end
"""
    predRegression(X, beta, y)
    predRegression(X::Vector{Float64}, beta, y)

Predict with a linear model of the form `y = X * beta [+ b0]`.

If `length(beta) == size(X, 2) + 1`, the first coefficient is treated as the
intercept `b0`. An incompatible `beta` length raises an error.

Returns `(ypred, rmsep)` where `rmsep` is the root-mean-square prediction error.
"""
function predRegression(X, beta, y)
    p = size(X, 2)
    if length(beta) == p
        ypred = X * beta
    elseif length(beta) == p + 1
        ypred = beta[1] .+ X * beta[2:end]
    else
        # Previously an incompatible beta left `ypred` undefined (UndefVarError).
        throw(DimensionMismatch("length(beta) = $(length(beta)) does not match size(X, 2) = $p (with or without intercept)"))
    end
    # Bug fix: RMSEP is the root of the MEAN of SQUARED residuals; the previous
    # sqrt(mean(y - ypred).^2) collapsed to |mean residual|.
    rmsep = sqrt(mean((y - ypred) .^ 2))
    return ypred, rmsep
end
function predRegression(X::Vector{Float64}, beta, y)
    # Single-observation method: `X` is one sample (a row of the data matrix).
    p = length(X)
    if length(beta) == p
        ypred = X' * beta
    elseif length(beta) == p + 1
        ypred = beta[1] .+ X' * beta[2:end]
    else
        # Previously an incompatible beta left `ypred` undefined (UndefVarError).
        throw(DimensionMismatch("length(beta) = $(length(beta)) does not match length(X) = $p (with or without intercept)"))
    end
    # Bug fix: square the residuals BEFORE averaging. For a scalar y the old
    # formula sqrt(mean(y - ypred).^2) coincided with |residual|, but for a
    # vector y it returned |mean residual| instead of the RMSE.
    rmsep = sqrt(mean((y - ypred) .^ 2))
    return ypred, rmsep
end
"""
    calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)
    calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)

Compute train/validation/test RMSE for `regfunction` with arguments `funcargs`.

`regfunction` is called as `regfunction(X, y, funcargs...)` and its first
return value must be a matrix of regression coefficients (one column per model
size). The first method draws `nsplits` random splits with proportions `props`;
the second uses the supplied split indices.

Returns `(rmse, meanrmse)`: `rmse` is a `3 x kmax x nsplits` tensor
(rows = train/val/test) and `meanrmse` its mean over splits.
"""
function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)
    splits, nTrainValTest = createDataSplitInds(X, nsplits, props, rngseed)
    # NOTE(review): the split-based method returns (rmse, meanrmse); the local
    # names here previously had the two swapped, which obscured the return
    # order (the returned values themselves were unchanged).
    rmse, meanrmse = calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc, emscdegree)
    return rmse, meanrmse
end
function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)
    nsplits = size(splits, 2)
    # Probe fit on the full data only to discover the number of model sizes
    # (kmax). Slow, but works for any regression function.
    Bfull, _ = regfunction(X, y, funcargs...)
    kmax = size(Bfull, 2)
    rmse = zeros(3, kmax, nsplits)
    for s in 1:nsplits
        d = createDataSplit(splits, nTrainValTest, s, X, y)
        Xtr, Xva, Xte = d["XTrain"], d["XVal"], d["XTest"]
        ytr, yva, yte = d["YTrain"], d["YVal"], d["YTest"]
        if emscpreproc
            # EMSC is fitted on the training block only, then applied to the
            # validation and test blocks.
            Xtr, emscout = EMSC(Xtr, emscdegree, "svd", 1, -1, 0) # nRef, baseDeg, intF
            Xva, _ = EMSC(Xva, emscout["model"])
            Xte, _ = EMSC(Xte, emscout["model"])
        end
        B, _ = regfunction(Xtr, ytr, funcargs...)
        for k in 1:kmax
            # Rows of `rmse`: 1 = train, 2 = validation, 3 = test.
            _, rmse[1, k, s] = predRegression(Xtr, B[:, k], ytr)
            _, rmse[2, k, s] = predRegression(Xva, B[:, k], yva)
            _, rmse[3, k, s] = predRegression(Xte, B[:, k], yte)
        end
    end
    meanrmse = dropdims(mean(rmse, dims=3), dims=3)
    return rmse, meanrmse
end
"""
    createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Create training/validation/test splits for a dataset.

# Arguments
- `X` : data matrix (rows are samples) or an integer number of samples
- `nSplits` : number of independent splits to create
- `props` : 3-vector of train/val/test proportions, must sum to 1
- `rngseed` : seed for reproducibility

# Returns
- `splits` : `size(X, 1) x nSplits` matrix of shuffled sample indices; in each
  column the first `nTrainValTest[1]` rows are training samples, then
  validation, then test
- `nTrainValTest` : 3-vector of train/val/test sample counts (samples left over
  by the floored proportions are added to the training set)
"""
function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    # The split depends only on the number of rows, so delegate to the integer
    # method (this removes the previous copy-pasted duplicate implementation).
    return createDataSplitInds(size(X, 1), nSplits, props, rngseed)
end
"""
    createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Method taking the number of samples `n = X` directly. See the matrix method for
details on the returned `splits` and `nTrainValTest`.
"""
function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    n = X
    if abs(sum(props) - 1) > 1e-14
        error("Error! Proportions do not add up to 1")
    end
    nTrain = convert(Int64, floor(n * props[1]))
    nVal = convert(Int64, floor(n * props[2]))
    nTest = convert(Int64, floor(n * props[3]))
    # Samples left unassigned by the floored proportions go to the training set.
    nTrain += n - (nTrain + nVal + nTest)
    # Cleanup: removed a dead `nTrainValTest = zeros(3)` that was always
    # overwritten, and collapsed the previous three-segment column assignment
    # (which copied consecutive chunks of the same permutation) into one.
    splits = zeros(Int64, n, nSplits)
    Random.seed!(rngseed)
    for i in 1:nSplits
        splits[:, i] = randperm(n)
    end
    return splits, [nTrain, nVal, nTest]
end
"""
    createDataSplit(splits, nTrainValTest, splitInd, X, Y)

Materialise one training/validation/test split of the data.

# Arguments
- `splits` : `size(X, 1) x nSplits` index matrix (from `createDataSplitInds`)
- `nTrainValTest` : 3-vector of train/val/test sample counts
- `splitInd` : which column of `splits` to use
- `X`, `Y` : data matrix and response vector/matrix

# Returns
Dict with keys `"XTrain"`, `"XVal"`, `"XTest"`, `"YTrain"`, `"YVal"`, `"YTest"`.
"""
function createDataSplit(splits, nTrainValTest, splitInd, X, Y)
    nTrain = nTrainValTest[1]
    nVal = nTrainValTest[2]
    trainInds = splits[1:nTrain, splitInd]
    valInds = splits[nTrain+1:nTrain+nVal, splitInd]
    testInds = splits[nTrain+nVal+1:end, splitInd]
    return Dict(
        "XTrain" => X[trainInds, :], "XVal" => X[valInds, :], "XTest" => X[testInds, :],
        "YTrain" => Y[trainInds, :], "YVal" => Y[valInds, :], "YTest" => Y[testInds, :],
    )
end
"""
    createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

Create stratified training/validation/test splits for two-class data.

Group membership in `G` must be coded as 1 and 2; each of the train/val/test
sets receives (approximately) the same class proportions as the full dataset.

# Arguments
- `G` : vector of group labels (1 or 2)
- `nSplits` : number of independent splits
- `props` : 3-vector of train/val/test proportions, must sum to 1
- `rngseed` : seed for reproducibility

# Returns
- `splits` : `length(G) x nSplits` matrix of shuffled sample indices ordered as
  training block, then validation, then test
- `nTrainValTest` : 3-vector of train/val/test sample counts (samples left over
  by the floored proportions are added to the training set)
"""
function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    n = length(G)
    # Locate the two classes with a tolerance so Int and Float codings both work.
    inds1 = findall(@. abs(G - 1.0) < 1e-14)
    inds2 = findall(@. abs(G - 2.0) < 1e-14)
    n1 = length(inds1)
    n2 = length(inds2)
    p1 = n1 / n
    p2 = 1 - p1
    # Per-class train/val/test sizes from the floored overall proportions.
    trn1 = convert(Int64, floor(n * props[1] * p1))
    val1 = convert(Int64, floor(n * props[2] * p1))
    tst1 = convert(Int64, floor(n * props[3] * p1))
    trn2 = convert(Int64, floor(n * props[1] * p2))
    val2 = convert(Int64, floor(n * props[2] * p2))
    tst2 = convert(Int64, floor(n * props[3] * p2))
    # Samples left over by the flooring are assigned to the training set.
    trn1 += n1 - (trn1 + val1 + tst1)
    trn2 += n2 - (trn2 + val2 + tst2)
    nTrain = trn1 + trn2
    nVal = val1 + val2
    nTest = tst1 + tst2
    if nTrain + nVal + nTest != n
        error("Error! Some samples are not assigned!")
    end
    splits = zeros(Int64, n, nSplits)
    Random.seed!(rngseed)
    for s in 1:nSplits
        # Shuffle within each class, then take consecutive chunks per set.
        ord1 = randperm(n1)
        ord2 = randperm(n2)
        splits[1:nTrain, s] = [inds1[ord1[1:trn1]]; inds2[ord2[1:trn2]]]
        splits[nTrain+1:nTrain+nVal, s] = [inds1[ord1[trn1+1:trn1+val1]]; inds2[ord2[trn2+1:trn2+val2]]]
        splits[nTrain+nVal+1:end, s] = [inds1[ord1[trn1+val1+1:end]]; inds2[ord2[trn2+val2+1:end]]]
    end
    return splits, [nTrain, nVal, nTest]
end
"""
    importData(datasetName, datasetPath="/home/jovyan/Datasets/")

Load a benchmark dataset from `datasetPath * datasetName * ".mat"`.

Example: `X, y, waves = importData("Beer");`

Returns `X, Y/G` and, for some datasets, extras such as wavelength/ppm axes or
a baseline. Datasets that are listed but not yet supported return `nothing`;
an unknown `datasetName` raises an error.

Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb,
leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original,
Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11
"""
function importData(datasetName, datasetPath="/home/jovyan/Datasets/")
    vars = matread(string(datasetPath, datasetName, ".mat"))
    if datasetName == "9tumors"
        # Bug fix: this branch (and several below) previously fell through
        # without `return`, so only the value of the last assignment was
        # returned instead of the documented (X, Y/G) tuple.
        return vars["X"], vars["G"]
    elseif datasetName == "bacteria_K4_K5_K6"
        # Causes crash for some reason
        return nothing
    elseif datasetName == "Beer"
        X = vars["X"]
        Y = vars["Y"] # G for some datasets
        waves = 400:2:2250
        return X, Y, waves
    elseif datasetName == "Dough"
        X = [vars["Xtrain"]; vars["Xtest"]]
        Y = [vars["Ytrain"]; vars["Ytest"]]
        # NOTE(review): selecting a single response column was sketched here
        # but never enabled; all responses are returned.
        return X, Y
    elseif datasetName == "FTIR_AMW_tidied"
        return vars["X"], vars["Y"]
    elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP"
        return nothing # not yet supported
    elseif datasetName == "HPLCforweb"
        return vars["HPLCforweb"]["data"] # X only; no response in this file
    elseif datasetName == "leukemia1"
        return vars["X"], vars["G"]
    elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact"
        return nothing # not yet supported
    elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW!
        return vars["Ramandata"], vars["PUFAdata"], vars["Wavelengths_Raman"]
    elseif datasetName == "NMR_WineData"
        return vars["X"], vars["Y"], vars["ppm"]
    elseif datasetName == "onion_NMR"
        return vars["x"], vars["onion"], vars["ppm"]
    elseif datasetName == "OVALung"
        return vars["X"], vars["y"]
    elseif datasetName == "Raman_Adipose_fat_pig_original"
        X = vars["spectra"]
        Y = vars["fat"]
        baseline = vars["baseline"]
        # Wavelengths are stored as strings in this file.
        waves = parse.(Float64, vars["wavelength"])
        return X, Y, waves, baseline
    elseif datasetName == "Ramanmilk"
        # There are replicates that need to be handled
        #X = vars["SpectraR"]
        #Y = vars["CLA"]
        return nothing
    elseif datasetName == "RAMANPorkFat"
        return vars["X"]["data"], vars["Y"]["data"]
    elseif datasetName == "spectra"
        return vars["NIR"], vars["octane"], 900:2:1700
    elseif datasetName == "Sugar"
        X = [vars["Xtrain"]; vars["Xtest"]]
        Y = [vars["Ytrain"]; vars["Ytest"]]
        return X, Y, vars["wave"]
    elseif datasetName == "tumor11"
        return vars["X"], vars["G"]
    else
        # Previously an unknown name silently returned `nothing`.
        error("Unknown dataset name: $datasetName")
    end
end