Added k-fold cv
This commit is contained in:
parent
3254f847e0
commit
f6e7820917
2 changed files with 93 additions and 14 deletions
|
|
@ -28,7 +28,9 @@ export createDataSplitBinaryStratified
|
||||||
export importData
|
export importData
|
||||||
export calculateRMSE
|
export calculateRMSE
|
||||||
export predRegression
|
export predRegression
|
||||||
export modelSelectionStatistics
|
export modelSelection
|
||||||
|
export createCVSplitInds
|
||||||
|
export calculateRMSECV
|
||||||
|
|
||||||
export PCR
|
export PCR
|
||||||
export bidiag2
|
export bidiag2
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,64 @@ XVal = my_split["XVal"];
|
||||||
|
|
||||||
using Random
|
using Random
|
||||||
|
|
||||||
|
"""
|
||||||
|
calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
|
||||||
|
calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)
|
||||||
|
|
||||||
|
Calculates RMSECV.
|
||||||
|
Second function calculates RMSECV according to data split given by variable 'split' (which should be output of
|
||||||
|
function computeCVSplitInds).
|
||||||
|
|
||||||
|
Returns rmsecv, meanrmsecv, where rmsecv is kmax x n_splits matrix, and meanrmsecv is vector of length kmax.
|
||||||
|
"""
|
||||||
|
function calculateRMSECV(X, y, regfunction, funcargs; n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
|
||||||
|
|
||||||
|
splits = createCVSplitInds(X, n_splits, n_folds, rngseed);
|
||||||
|
rmsecv, n_comps, meanrmsecv = calculateRMSECV(X, y, splits, regfunction, funcargs, emscpreproc=emscpreproc, emscdegree=emscdegree);
|
||||||
|
|
||||||
|
return rmsecv, n_comps, meanrmsecv
|
||||||
|
end
|
||||||
|
|
||||||
|
function calculateRMSECV(X, y, splits, regfunction, funcargs; emscpreproc=false, emscdegree=6)
|
||||||
|
|
||||||
|
n = size(X, 1);
|
||||||
|
n_splits = size(splits,2);
|
||||||
|
B, _ = regfunction(X, y, funcargs...); # <- Slow, but works in general. Maybe add some special cases for known functions?
|
||||||
|
kmax = size(B, 2);
|
||||||
|
rmsecv = zeros(kmax, n_splits);
|
||||||
|
n_folds = length(unique(splits[:,1]))
|
||||||
|
n_comps = convert(Vector{Int64}, zeros(n_splits));
|
||||||
|
|
||||||
|
for i in 1:n_splits
|
||||||
|
|
||||||
|
for j=1:n_folds
|
||||||
|
|
||||||
|
XTrain = X[splits[:,i] .!= j,:];
|
||||||
|
XTest = X[splits[:,i] .== j,:];
|
||||||
|
yTrain = y[splits[:,i] .!= j,:];
|
||||||
|
yTest = y[splits[:,i] .== j,:];
|
||||||
|
|
||||||
|
if emscpreproc
|
||||||
|
XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0); # nRef, baseDeg, intF
|
||||||
|
XTest, _ = EMSC(XTest, output["model"]);
|
||||||
|
end
|
||||||
|
|
||||||
|
B, _ = regfunction(XTrain, yTrain, funcargs...);
|
||||||
|
|
||||||
|
for k=1:kmax
|
||||||
|
yTestPred, _ = predRegression(XTest, B[:,k], yTest);
|
||||||
|
rmsecv[k, i] += sum((yTestPred - yTest).^2);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
_, n_comps[i] = findmin(rmsecv[:,i]);
|
||||||
|
end
|
||||||
|
|
||||||
|
rmsecv = sqrt.(rmsecv ./ n);
|
||||||
|
meanrmsecv = mean(rmsecv, dims=2);
|
||||||
|
|
||||||
|
return rmsecv, n_comps, meanrmsecv
|
||||||
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
|
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
|
||||||
|
|
@ -27,7 +84,6 @@ Extra samples if any are assigned to the lower indices.
|
||||||
"""
|
"""
|
||||||
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
|
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
|
||||||
|
|
||||||
|
|
||||||
n = size(X,1)
|
n = size(X,1)
|
||||||
splits = convert(Matrix{Int64}, zeros(n, n_splits)); # fold membership coded as 1, 2, ..., n_folds
|
splits = convert(Matrix{Int64}, zeros(n, n_splits)); # fold membership coded as 1, 2, ..., n_folds
|
||||||
fold_size = convert(Int64, floor(n/n_folds))
|
fold_size = convert(Int64, floor(n/n_folds))
|
||||||
|
|
@ -54,13 +110,32 @@ end
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
function modelSelectionStatistics(results)
|
function modelSelection(results, results_type, selection_rule="min")
|
||||||
|
|
||||||
Takes as input the rmse output from calculateRMSE and returns
|
### Inputs
|
||||||
the number of components minimising the validation error
|
- results : Matrix/Tensor with results, output from calculateRMSE[CV].
|
||||||
together with the test set results.
|
- results_type : "k-fold" or "train-val-test".
|
||||||
|
- selection_rule : "min" only for now. Can add 1 S.E., Chi^2, etc.
|
||||||
|
|
||||||
|
### Outputs:
|
||||||
|
- results_sel : Results for the selected number of components.
|
||||||
|
- n_comps : The number of components chosen for each split.
|
||||||
"""
|
"""
|
||||||
function modelSelectionStatistics(results)
|
function modelSelection(results, results_type, selection_rule="min")
|
||||||
|
|
||||||
|
|
||||||
|
if results_type == "k-fold"
|
||||||
|
|
||||||
|
n_splits = size(results, 2);
|
||||||
|
n_comps = convert(Vector{Int64}, zeros(n_splits));
|
||||||
|
results_sel = zeros(n_splits);
|
||||||
|
|
||||||
|
for i in 1:n_splits
|
||||||
|
_, n_comps[i] = findmin(results[:,i]);
|
||||||
|
results_sel[i] = results[n_comps[i], i];
|
||||||
|
end
|
||||||
|
|
||||||
|
elseif results_type == "train-val-test"
|
||||||
|
|
||||||
n_iter = size(results, 3);
|
n_iter = size(results, 3);
|
||||||
results_sel = zeros(n_iter);
|
results_sel = zeros(n_iter);
|
||||||
|
|
@ -70,12 +145,14 @@ for i=1:n_iter
|
||||||
_, n_comps[i] = findmin(results[2,:,i]);
|
_, n_comps[i] = findmin(results[2,:,i]);
|
||||||
results_sel[i] = results[3, n_comps[i], i];
|
results_sel[i] = results[3, n_comps[i], i];
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
return results_sel, n_comps
|
return results_sel, n_comps
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
function predRegression(X, beta, y)
|
function predRegression(X, beta, y)
|
||||||
function predRegression(X::Vector{Float64}, beta, y)
|
function predRegression(X::Vector{Float64}, beta, y)
|
||||||
|
|
@ -163,7 +240,7 @@ end
|
||||||
|
|
||||||
meanrmse = dropdims(mean(rmse, dims=3), dims=3);
|
meanrmse = dropdims(mean(rmse, dims=3), dims=3);
|
||||||
|
|
||||||
return meanrmse, rmse
|
return rmse, meanrmse
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue