Added lots of stuff

This commit is contained in:
Joakim Skogholt 2023-05-10 21:49:25 +02:00
parent c9644c0f6d
commit 4ad7d0c8d6
4 changed files with 178 additions and 3 deletions

View file

@ -28,6 +28,7 @@ export createDataSplitBinaryStratified
export importData export importData
export PCR export PCR
export bidiag2
include("preprocessing.jl") include("preprocessing.jl")

View file

@ -17,6 +17,103 @@ using Random
"""
function predRegression(X, beta, y)
function predRegression(X::Vector{Float64}, beta, y)
Prediction for linear model of form y = X * beta [+ b0]
Returns ypred, rmsep
"""
function predRegression(X, beta, y)
if length(beta) == size(X,2)
ypred = X * beta;
elseif length(beta) == size(X,2) + 1
ypred = beta[1] .+ X * beta[2:end]
end
rmsep = sqrt(mean(y - ypred).^2)
return ypred, rmsep
end
function predRegression(X::Vector{Float64}, beta, y)
if length(beta) == length(X)
ypred = X' * beta;
elseif length(beta) == length(X) + 1
ypred = beta[1] .+ X' * beta[2:end];
end
rmsep = sqrt(mean(y - ypred).^2)
return ypred, rmsep
end
"""
function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42)
function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs)
Calculates rmse for train/val/test sets for regfunction with args funcargs.
Assume regfunction takes arguments of form "regfunction(X, y, funcargs...)" and first return argument
is regression coefficients. Second function uses the splits given as input.
"""
function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)
splits, nTrainValTest = createDataSplitInds(X, nsplits, props, rngseed);
meanrmse, rmse = calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc, emscdegree);
return meanrmse, rmse
end
function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)
nsplits = size(splits,2);
B, _ = regfunction(X, y, funcargs...); # <- Slow, but works in general. Maybe add some special cases for known functions?
kmax = size(B, 2);
rmse = zeros(3, kmax, nsplits);
for i in 1:nsplits
split = createDataSplit(splits, nTrainValTest, i, X, y);
XTrain = split["XTrain"];
XVal = split["XVal"];
XTest = split["XTest"];
yTrain = split["YTrain"];
yVal = split["YVal"];
yTest = split["YTest"];
if emscpreproc
XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0); # nRef, baseDeg, intF
XVal, _ = EMSC(XVal, output["model"]);
XTest, _ = EMSC(XTest, output["model"]);
end
B, _ = regfunction(XTrain, yTrain, funcargs...);
for j=1:kmax
_, rmse[1, j, i] = predRegression(XTrain, B[:,j], yTrain);
_, rmse[2, j, i] = predRegression(XVal, B[:,j], yVal);
_, rmse[3, j, i] = predRegression(XTest, B[:,j], yTest);
end
end
meanrmse = dropdims(mean(rmse, dims=3), dims=3);
return meanrmse, rmse
end
""" """
createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)

View file

@ -135,6 +135,7 @@ refSpec = mean(X, dims=1)';
basis = [refSpec P]; basis = [refSpec P];
return_values = EMSCTraditional(X, basis); return_values = EMSCTraditional(X, basis);
return return_values return return_values
end end
@ -151,8 +152,7 @@ function EMSCTraditional(X, basis::Matrix{Float64})
coeffs = basis \ X'; coeffs = basis \ X';
X_Cor = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:]; X_Cor = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:];
return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]); return X_Cor, Dict([("basis", basis), ("coeffs", coeffs)]);
return return_values
end end

View file

@ -32,3 +32,80 @@ B = [b0; B];
return B, U, s, V return B, U, s, V
end end
"""
bidiag2(X, y, A)
Julia version of the bidiag2 MATLAB function in Björck and Indahl (2017)
### Arguments
- 'X' - Data matrix of predictors
- 'y' - Vector of responses
- 'A' - Number of components
### Returns:
- beta - Matrix with regression coefficients (including constant term if centre==true)
- W, T, B - Matrices such that X \approx T*B*W'.
B in compact form, use following to create full: B2 = zeros(A,A); B2[diagind(B2)] = B[:,1]; B2[diagind(B2,1)] = B[2:end,2];
"""
function bidiag2(X, y, A, centre=true)
n, p = size(X);
w = zeros(p, 1);
W = zeros(p, A);
t = zeros(n, 1);
T = zeros(n, A);
P = zeros(p, A);
beta = zeros(p, A);
B = zeros(A,2);
mX = mean(X, dims=1);
my = mean(y, dims=1);
y = y .- my;
if centre
X = X .- mX;
end
w = X'y;
w = w / norm(w);
W[:,1] = w;
t = X*w;
rho = norm(t);
t = t / rho;
T[:,1] = t;
B[1,1] = rho;
d = w / rho;
beta[:,1] = (t'y) .* d;
for a in 2:A
w = X'*t - rho * w;
w = w - W*(W'*w);
theta = norm(w);
w = w / theta;
W[:,a] = w;
t = X * w - theta * t;
t = t - T*(T'*t);
rho = norm(t);
t = t / rho;
T[:,a] = t;
B[a-1, 2] = theta;
B[a, 1] = rho;
d = (w - theta*d) / rho;
beta[:,a] = beta[:,a-1] + (t'y) .* d;
end
if centre
b0 = my .- mX * beta
beta = [b0; beta];
end
returnvals = beta, W, T, B
return returnvals
end