From 4ad7d0c8d6d609342a23e3f18c785d624040d23d Mon Sep 17 00:00:00 2001
From: Joakim Skogholt
Date: Wed, 10 May 2023 21:49:25 +0200
Subject: [PATCH] Added lots of stuff

---
 src/MinPakke.jl                   |  1 +
 src/convenience.jl                | 97 +++++++++++++++++++++++++++++++
 src/preprocessing.jl              |  6 +-
 src/variousRegressionFunctions.jl | 77 ++++++++++++++++++++++++
 4 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/src/MinPakke.jl b/src/MinPakke.jl
index f8c4ab0..118b22d 100644
--- a/src/MinPakke.jl
+++ b/src/MinPakke.jl
@@ -28,6 +28,7 @@ export createDataSplitBinaryStratified
 export importData
 export PCR
+export bidiag2
 
 
 
 include("preprocessing.jl")
diff --git a/src/convenience.jl b/src/convenience.jl
index 4712aa0..3c9468c 100644
--- a/src/convenience.jl
+++ b/src/convenience.jl
@@ -17,6 +17,103 @@ using Random
 
 
 
+
+
+
+
+"""
+    function predRegression(X, beta, y)
+    function predRegression(X::Vector{Float64}, beta, y)
+
+Prediction for linear model of form y = X * beta [+ b0]
+Returns ypred, rmsep
+"""
+function predRegression(X, beta, y)
+
+if length(beta) == size(X,2)
+    ypred = X * beta;
+elseif length(beta) == size(X,2) + 1
+    ypred = beta[1] .+ X * beta[2:end]
+end
+
+rmsep = sqrt(mean((y - ypred).^2))
+
+return ypred, rmsep
+end
+
+function predRegression(X::Vector{Float64}, beta, y)
+
+if length(beta) == length(X)
+    ypred = X' * beta;
+elseif length(beta) == length(X) + 1
+    ypred = beta[1] .+ X' * beta[2:end];
+end
+
+rmsep = sqrt(mean((y - ypred).^2))
+
+return ypred, rmsep
+end
+
+
+
+
+"""
+    function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)
+    function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)
+
+Calculates rmse for train/val/test sets for regfunction with args funcargs.
+Assume regfunction takes arguments of form "regfunction(X, y, funcargs...)" and first return argument
+is regression coefficients. Second function uses the splits given as input. 
+
+"""
+function calculateRMSE(X, y, regfunction, funcargs, nsplits=1, props=[6/10, 2/10, 2/10], rngseed=42, emscpreproc=false, emscdegree=6)
+
+splits, nTrainValTest = createDataSplitInds(X, nsplits, props, rngseed);
+meanrmse, rmse = calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc, emscdegree);
+
+return meanrmse, rmse
+end
+
+function calculateRMSE(X, y, splits, nTrainValTest, regfunction, funcargs, emscpreproc=false, emscdegree=6)
+
+nsplits = size(splits,2);
+B, _ = regfunction(X, y, funcargs...); # <- Slow, but works in general. Maybe add some special cases for known functions?
+kmax = size(B, 2);
+rmse = zeros(3, kmax, nsplits);
+
+for i in 1:nsplits
+    split = createDataSplit(splits, nTrainValTest, i, X, y);
+    XTrain = split["XTrain"];
+    XVal = split["XVal"];
+    XTest = split["XTest"];
+    yTrain = split["YTrain"];
+    yVal = split["YVal"];
+    yTest = split["YTest"];
+
+    if emscpreproc
+        XTrain, output = EMSC(XTrain, emscdegree, "svd", 1, -1, 0); # nRef, baseDeg, intF
+        XVal, _ = EMSC(XVal, output["model"]);
+        XTest, _ = EMSC(XTest, output["model"]);
+    end
+
+    B, _ = regfunction(XTrain, yTrain, funcargs...);
+
+    for j=1:kmax
+        _, rmse[1, j, i] = predRegression(XTrain, B[:,j], yTrain);
+        _, rmse[2, j, i] = predRegression(XVal, B[:,j], yVal);
+        _, rmse[3, j, i] = predRegression(XTest, B[:,j], yTest);
+    end
+
+end
+
+meanrmse = dropdims(mean(rmse, dims=3), dims=3);
+
+return meanrmse, rmse
+end
+
+
+
+
 """
     createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
     createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index d193dfa..62ef3ab 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -135,6 +135,7 @@
 refSpec = mean(X, dims=1)';
 basis = [refSpec P];
 return_values = EMSCTraditional(X, basis);
+
 return return_values
 end
 
@@ -150,9 +151,8 @@ function EMSCTraditional(X, basis::Matrix{Float64})
 
 coeffs = basis \ X';
 X_Cor = (X - 
 coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:];
-
-return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]);
-return return_values
+
+return X_Cor, Dict([("basis", basis), ("coeffs", coeffs)]);
 end
diff --git a/src/variousRegressionFunctions.jl b/src/variousRegressionFunctions.jl
index 2e4b17b..4222aa7 100644
--- a/src/variousRegressionFunctions.jl
+++ b/src/variousRegressionFunctions.jl
@@ -31,4 +31,81 @@
 
 b0 = my .- mX * B
 B = [b0; B];
 return B, U, s, V
+end
+
+
+
+
+"""
+    bidiag2(X, y, A)
+
+Julia version of the bidiag2 MATLAB function in Björck and Indahl (2017)
+
+### Arguments
+ - 'X' - Data matrix of predictors
+ - 'y' - Vector of responses
+ - 'A' - Number of components
+
+### Returns:
+ - beta - Matrix with regression coefficients (including constant term if centre==true)
+ - W, T, B - Matrices such that X \approx T*B*W'.
+   B in compact form, use following to create full: B2 = zeros(A,A); B2[diagind(B2)] = B[:,1]; B2[diagind(B2,1)] = B[2:end,2];
+"""
+function bidiag2(X, y, A, centre=true)
+
+n, p = size(X);
+
+w = zeros(p, 1);
+W = zeros(p, A);
+t = zeros(n, 1);
+T = zeros(n, A);
+P = zeros(p, A);
+beta = zeros(p, A);
+B = zeros(A,2);
+
+mX = mean(X, dims=1);
+my = mean(y, dims=1);
+y = y .- my;
+
+if centre
+    X = X .- mX;
+end
+
+w = X'y;
+w = w / norm(w);
+W[:,1] = w;
+t = X*w;
+rho = norm(t);
+t = t / rho;
+T[:,1] = t;
+
+B[1,1] = rho;
+d = w / rho;
+beta[:,1] = (t'y) .* d;
+
+for a in 2:A
+    w = X'*t - rho * w;
+    w = w - W*(W'*w);
+    theta = norm(w);
+    w = w / theta;
+    W[:,a] = w;
+    t = X * w - theta * t;
+    t = t - T*(T'*t);
+    rho = norm(t);
+    t = t / rho;
+    T[:,a] = t;
+    B[a-1, 2] = theta;
+    B[a, 1] = rho;
+    d = (w - theta*d) / rho;
+    beta[:,a] = beta[:,a-1] + (t'y) .* d;
+end
+
+if centre
+    b0 = my .- mX * beta
+    beta = [b0; beta];
+end
+
+returnvals = beta, W, T, B
+return returnvals
+end
\ No newline at end of file