Started adding cv, but some mistakes somewhere

This commit is contained in:
Joakim Skogholt 2023-05-11 21:43:52 +02:00
parent f7350649be
commit 418307a666

View file

@ -16,6 +16,99 @@ XVal = my_split["XVal"];
using Random using Random
"""
    calculateRMSECV(X, y, regfunction, funcargs, n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
    calculateRMSECV(X, y, splits, regfunction, funcargs, emscpreproc=false, emscdegree=6)

Calculate the root mean squared error of cross-validation (RMSECV).

The first form draws `n_splits` random `n_folds`-fold partitions of the rows of `X`
with `createCVSplitInds(X, n_splits, n_folds, rngseed)` and forwards them to the
second form. The second form uses the partitions given in `splits` (which should be
the output of `createCVSplitInds`).

# Arguments
- `X`, `y`: predictors and responses; rows are samples.
- `regfunction`, `funcargs`: the regression is fitted as
  `regfunction(XTrain, yTrain, funcargs...)`.
- `n_splits`, `n_folds`, `rngseed`: passed to `createCVSplitInds`.
- `emscpreproc`, `emscdegree`: passed through to the second form, which controls
  optional EMSC preprocessing with them.

# Returns
`(meanrmsecv, rmsecv)` where `rmsecv` is a `kmax x n_splits` matrix and `meanrmsecv`
holds the mean over the splits for each of the `kmax` models.
"""
function calculateRMSECV(X, y, regfunction, funcargs, n_splits=1, n_folds=5, rngseed=42, emscpreproc=false, emscdegree=6)
    splits = createCVSplitInds(X, n_splits, n_folds, rngseed)
    # Bind the results to the same names that are returned; the previous version
    # assigned to `meanrmse, rmse` and then returned undefined variables.
    meanrmsecv, rmsecv = calculateRMSECV(X, y, splits, regfunction, funcargs, emscpreproc, emscdegree)
    return meanrmsecv, rmsecv
end
"""
    calculateRMSECV(X, y, splits::AbstractMatrix, regfunction, funcargs, emscpreproc=false, emscdegree=6)

Calculate RMSECV for the cross-validation partitions given in `splits`.

# Arguments
- `X`, `y`: predictors and responses; rows are samples.
- `splits`: matrix with one row per sample and one column per repetition; entry
  `splits[s, i]` is the fold label (`1, 2, ...`) of sample `s` in repetition `i`.
  The `AbstractMatrix` annotation keeps the 5-7 argument forms of this method from
  having the same signature as (and thereby replacing) the equally long forms of the
  method that takes `regfunction` in third position.
- `regfunction`, `funcargs`: the regression is fitted as
  `B, _ = regfunction(XTrain, yTrain, funcargs...)`; every column of `B` is
  evaluated as a separate model.
- `emscpreproc`: when `true`, `EMSC` is fitted on the training rows and the
  resulting model is applied to the test rows of every fold.
- `emscdegree`: second argument of the `EMSC` call.

# Returns
`(meanrmsecv, rmsecv)`: `rmsecv` is a `kmax x n_splits` matrix and `meanrmsecv` is
the vector of length `kmax` with the mean of `rmsecv` over the repetitions.

# Throws
- `DimensionMismatch` if `splits` does not have one row per row of `X`.
- `ArgumentError` if `splits` has no columns.
"""
function calculateRMSECV(X, y, splits::AbstractMatrix, regfunction, funcargs, emscpreproc=false, emscdegree=6)
    n = size(X, 1)
    size(splits, 1) == n || throw(DimensionMismatch(
        "splits has $(size(splits, 1)) rows but X has $n rows"))
    n_splits = size(splits, 2)
    n_splits > 0 || throw(ArgumentError("splits must have at least one column"))

    # Fit once on all data only to learn how many models (columns of B) the
    # regression returns. Slow, but works for any regfunction.
    B, _ = regfunction(X, y, funcargs...)
    kmax = size(B, 2)

    rmsecv = zeros(kmax, n_splits)
    n_folds = length(unique(splits[:, 1]))

    for i in 1:n_splits
        # Fold labels of repetition i (column i, not the fold counter).
        fold_ids = splits[:, i]
        for j in 1:n_folds
            test_mask = fold_ids .== j
            train_mask = .!test_mask

            XTrain = X[train_mask, :]
            XTest = X[test_mask, :]
            yTrain = y[train_mask, :]
            yTest = y[test_mask, :]

            if emscpreproc
                XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0) # nRef, baseDeg, intF
                XTest, _ = EMSC(XTest, output["model"])
            end

            B, _ = regfunction(XTrain, yTrain, funcargs...)

            for k in 1:kmax
                yTestPred, _ = predRegression(XTest, B[:, k], yTest)
                # Accumulate squared errors; normalised after all folds are done.
                rmsecv[k, i] += sum(abs2, yTestPred - yTest)
            end
        end
    end

    # The squared errors of one repetition are summed over all folds, so the
    # divisor is the total number of samples.
    rmsecv = sqrt.(rmsecv ./ n)
    # Plain sum instead of `mean` so the function does not depend on Statistics.
    meanrmsecv = vec(sum(rmsecv, dims=2)) ./ n_splits
    return meanrmsecv, rmsecv
end
"""
    createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)

Create `n_splits` random partitions of the rows of `X` for `n_folds`-fold cross
validation.

Returns a `size(X, 1) x n_splits` matrix of `Int64`. Each column is one partition
and holds, for every row of `X`, the fold it belongs to (`1, ..., n_folds`). When the
number of rows is not divisible by `n_folds`, the extra samples are given to the
lowest-numbered folds.

The default random number generator is seeded with `rngseed` before shuffling, so
repeated calls with the same arguments return the same partitions.
"""
function createCVSplitInds(X, n_splits=1, n_folds=5, rngseed=42)
    nsamples = size(X, 1)
    membership = zeros(Int64, nsamples, n_splits)

    per_fold = floor(Int64, nsamples / n_folds)
    remainder = nsamples - per_fold * n_folds

    # Unshuffled template: `per_fold` consecutive entries for each fold label,
    # followed by the leftover samples labelled 1, 2, ..., remainder.
    labels = zeros(Int64, nsamples)
    for fold in 1:n_folds
        first_idx = per_fold * (fold - 1) + 1
        last_idx = per_fold * fold
        labels[first_idx:last_idx] .= fold
    end
    offset = per_fold * n_folds
    for extra in 1:remainder
        labels[offset + extra] = extra
    end

    Random.seed!(rngseed)
    for col in 1:n_splits
        membership[:, col] = shuffle(labels)
    end
    return membership
end
""" """
function modelSelectionStatistics(results) function modelSelectionStatistics(results)