diff --git a/src/MinPakke.jl b/src/MinPakke.jl index 86e2291..d84a472 100644 --- a/src/MinPakke.jl +++ b/src/MinPakke.jl @@ -28,7 +28,9 @@ export createDataSplitBinaryStratified export importData export calculateRMSE export predRegression -export modelSelectionStatistics +export modelSelection +export createCVSplitInds +export calculateRMSECV export PCR export bidiag2 diff --git a/src/convenience.jl b/src/convenience.jl index 346146d..3576960 100644 --- a/src/convenience.jl +++ b/src/convenience.jl @@ -15,7 +15,64 @@ XVal = my_split["XVal"]; using Random +""" + calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6) + calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6) +Calculates RMSECV. +Second function calculates RMSECV according to the data split given by variable 'splits' (which should be output of +function createCVSplitInds). + +Returns rmsecv, n_comps, meanrmsecv, where rmsecv is a kmax x n_splits matrix, n_comps is a vector of length n_splits with the error-minimising number of components per split, and meanrmsecv is a vector of length kmax. +""" +function calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6) + +splits = createCVSplitInds(X, n_splits, n_folds, rngseed); +rmsecv, n_comps, meanrmsecv = calculateRMSECV(X, y, splits, regfunction, funcargs, emscpreproc=emscpreproc, emscdegree=emscdegree); + +return rmsecv, n_comps, meanrmsecv +end + +function calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6) + +n = size(X, 1); +n_splits = size(splits,2); +B, _ = regfunction(X, y, funcargs...); # <- Slow, but works in general. Maybe add some special cases for known functions? 
+kmax = size(B, 2); +rmsecv = zeros(kmax, n_splits); +n_folds = length(unique(splits[:,1])) +n_comps = convert(Vector{Int64}, zeros(n_splits)); + +for i in 1:n_splits + + for j=1:n_folds + + XTrain = X[splits[:,i] .!= j,:]; + XTest = X[splits[:,i] .== j,:]; + yTrain = y[splits[:,i] .!= j,:]; + yTest = y[splits[:,i] .== j,:]; + + if emscpreproc + XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0); # nRef, baseDeg, intF + XTest, _ = EMSC(XTest, output["model"]); + end + + B, _ = regfunction(XTrain, yTrain, funcargs...); + + for k=1:kmax + yTestPred, _ = predRegression(XTest, B[:,k], yTest); + rmsecv[k, i] += sum((yTestPred - yTest).^2); + end + end + + _, n_comps[i] = findmin(rmsecv[:,i]); +end + +rmsecv = sqrt.(rmsecv ./ n); +meanrmsecv = mean(rmsecv, dims=2); + +return rmsecv, n_comps, meanrmsecv +end """ function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42) @@ -27,7 +84,6 @@ Extra samples if any are assigned to the lower indices. """ function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42) - n = size(X,1) splits = convert(Matrix{Int64}, zeros(n, n_splits)); # fold membership coded as 1, 2, ..., n_folds fold_size = convert(Int64, floor(n/n_folds)) @@ -54,21 +110,41 @@ end """ - function modelSelectionStatistics(results) + function modelSelection(results, results_type, selection_rule="min") -Takes as input the rmse output from calculateRMSE and returns -the number of components minimising the validation error -together with the test set results. +### Inputs + - results : Matrix/Tensor with results, output from calculateRMSE[CV]. + - results_type : "k-fold" or "train-val-test". + - selection_rule : "min" only for now. Can add 1 S.E., Chi^2, etc. + +### Outputs: + - results_sel : Results for the selected number of components. + - n_comps : The number of components chosen for each split. 
""" -function modelSelectionStatistics(results) +function modelSelection(results, results_type, selection_rule="min") + + +if results_type == "k-fold" + + n_splits = size(results, 2); + n_comps = convert(Vector{Int64}, zeros(n_splits)); + results_sel = zeros(n_splits); + + for i in 1:n_splits + _, n_comps[i] = findmin(results[:,i]); + results_sel[i] = results[n_comps[i], i]; + end + +elseif results_type == "train-val-test" -n_iter = size(results, 3); -results_sel = zeros(n_iter); -n_comps = convert(Vector{Int64}, zeros(n_iter)); + n_iter = size(results, 3); + results_sel = zeros(n_iter); + n_comps = convert(Vector{Int64}, zeros(n_iter)); -for i=1:n_iter - _, n_comps[i] = findmin(results[2,:,i]); - results_sel[i] = results[3, n_comps[i], i]; + for i=1:n_iter + _, n_comps[i] = findmin(results[2,:,i]); + results_sel[i] = results[3, n_comps[i], i]; + end end return results_sel, n_comps @@ -76,6 +152,7 @@ end + """ function predRegression(X, beta, y) function predRegression(X::Vector{Float64}, beta, y) @@ -163,7 +240,7 @@ end meanrmse = dropdims(mean(rmse, dims=3), dims=3); -return meanrmse, rmse +return rmse, meanrmse end