Initial Commit

2023-05-10 10:18:58 +02:00 · 2023-05-10 10:18:58 +02:00 · 974b59dcc3
commit 974b59dcc3
parent 2387599b24
5 changed files with 876 additions and 2 deletions
--- a/Project.toml
+++ b/Project.toml
@ -3,6 +3,16 @@ uuid = "be803360-cecc-4859-8120-03d0223bb960"
 authors = ["Joakim"]
 version = "1.0.0-DEV"
 [deps]
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 OptimizationBBO = "3e6eede4-6085-4f62-9a71-46d9bc1eb92b"
 OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 [compat]
 julia = "1"
--- a/src/MinPakke.jl
+++ b/src/MinPakke.jl
@ -1,5 +1,30 @@
 module MinPakke
-# Write your package code here.
+using LinearAlgebra
 using Statistics
 using Plots
 using MAT
 using Optimization
 using OptimizationOptimJL
 using OptimizationBBO
 using LaTeXStrings
 export plegendre
 export SNV
 export MSC
 export EMSCTraditional
 export savitzkyGolay
 export findBaseline
 export EMSC
 export EMSCCorrection
 export plotr
 export compPlot
 include("preprocessing.jl")
 include("convenience.jl")
 include("conveniencePlots.jl")
 end
--- a/src/convenience.jl
+++ b/src/convenience.jl
@ -0,0 +1,303 @@
 #=
 X, Y = importData("spectra");
 splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42);
 split = createDataSplit(splits, nTrainValTest, 5, X, Y);
 size(split["XTrain"])
 ###
 X, G             = importData("OVALung");
 splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
 my_split         = createDataSplit(splitStrat, nTVT, 2, X, G);
 XVal             = my_split["XVal"];
 =#
 """
    createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
    createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
 Creates training/validation/test split for dataset
 ### Arguments
    - 'X'       : Data matrix or integer indicating number of samples
    - 'nSplits' : Int, number of data splits to create
    - 'props'   : 3x1 Vector of proportions (train/val/test), must sum to 1
    - 'rngseed' : Int, for reproducibility
 ### Returns
    - 'splits'        : size(X,1) x nSplits matrix with indices
    - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
 """
 function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
 n             = size(X, 1);
 nTrainValTest = zeros(3);
 splits        = convert(Matrix{Int64}, zeros(n, nSplits));
 if abs(sum(props) - 1) > 1e-14
    error("Error! Proportions do not add up to 1");
 end
 nTrain = convert(Int64, floor(n * props[1]));
 nVal   = convert(Int64, floor(n * props[2]));
 nTest  = convert(Int64, floor(n * props[3]));
 nLeft  = n - (nTrain + nVal + nTest);
 nTrain = nTrain + nLeft
 Random.seed!(rngseed);
 for i in 1:nSplits
    inds = randperm(n);
    splits[1:nTrain                , i] = inds[1:nTrain];
    splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal];
    splits[nTrain+nVal+1:end       , i] = inds[nTrain+nVal+1 : end];
 end
 nTrainValTest = [nTrain, nVal, nTest];
 return splits, nTrainValTest
 end
 """
    createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
 Creates training/validation/test split for dataset
 ### Arguments
    - 'X'       : Int, number of samples
    - 'props'   : 3x1 Vector of proportions (train/val/test), must sum to 1
    - 'nSplits' : Int, number of data splits
    - 'rngseed' : Int, for reproducibility
 ### Returns
    - 'splits'        : size(X,1) x nSplits matrix with indices
    - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
 """
 function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
 n             = X;
 nTrainValTest = zeros(3);
 splits        = convert(Matrix{Int64}, zeros(n, nSplits));
 if abs(sum(props) - 1) > 1e-14
    error("Error! Proportions do not add up to 1");
 end
 nTrain = convert(Int64, floor(n * props[1]));
 nVal   = convert(Int64, floor(n * props[2]));
 nTest  = convert(Int64, floor(n * props[3]));
 nLeft  = n - (nTrain + nVal + nTest);
 nTrain = nTrain + nLeft
 Random.seed!(rngseed);
 for i in 1:nSplits
    inds = randperm(n);
    splits[1:nTrain                , i] = inds[1:nTrain];
    splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal];
    splits[nTrain+nVal+1:end       , i] = inds[nTrain+nVal+1 : end];
 end
 nTrainValTest = [nTrain, nVal, nTest];
 return splits, nTrainValTest
 end
 """
    createDataSplit(splits, nTrainValTest, splitInd, X, Y)
 Creates training/validation/test split for dataset
 ### Arguments
    - 'splits'        : size(X,1) x nSplits matrix with indices
    - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
    - 'splitInd'      : Index of split to use
    - 'X' and 'Y'     : Data matrix and response vector/matrix
 ### Returns
    - 'splits'        : size(X,1) x nSplits matrix with indices
    - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
 """
 function createDataSplit(splits, nTrainValTest, splitInd, X, Y)
 XTrain = X[splits[1                                     : nTrainValTest[1], splitInd], :];
 XVal   = X[splits[nTrainValTest[1] + 1                  : nTrainValTest[1] + nTrainValTest[2], splitInd], :];
 XTest  = X[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :];
 YTrain = Y[splits[1                                     : nTrainValTest[1], splitInd], :];
 YVal   = Y[splits[nTrainValTest[1] + 1                  : nTrainValTest[1] + nTrainValTest[2], splitInd], :];
 YTest  = Y[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :];
 return Dict([("XTrain", XTrain), ("XVal", XVal), ("XTest", XTest), ("YTrain", YTrain), ("YVal", YVal), ("YTest", YTest)]);
 end
 """
    createDataSplitBinaryStratified(G::Vector{Int64}, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
 Creates stratified training/validation/test split for dataset. Assumes G is coded as 1, 2
 ### Arguments
    - 'G'       : Vector{Int64}, group membership coded as 1, 2
    - 'nSplits' : Int, number of data splits
    - 'props'   : 3x1 Vector of proportions (train/val/test), must sum to 1
    - 'rngseed' : Int, for reproducibility
 ### Returns
    - 'splits'        : size(X,1) x nSplits matrix with indices
    - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
 """
 function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
 n      = length(G);
 splits = convert(Matrix{Int64}, zeros(n, nSplits));
 indsG1 = findall( @. abs(G - 1.0) < 1e-14)
 indsG2 = findall( @. abs(G - 2.0) < 1e-14)
 nG1    = length(indsG1);
 nG2    = length(indsG2);
 propG1 = nG1 / n
 propG2 = 1 - propG1;
 nTrain1 = convert(Int64, floor(n * props[1] * propG1));
 nVal1   = convert(Int64, floor(n * props[2] * propG1));
 nTest1  = convert(Int64, floor(n * props[3] * propG1));
 nTrain2 = convert(Int64, floor(n * props[1] * propG2));
 nVal2   = convert(Int64, floor(n * props[2] * propG2));
 nTest2  = convert(Int64, floor(n * props[3] * propG2));
 # Unassigned samples (if any) are added to the training set)
 nLeftG1 = nG1 - (nTrain1 + nVal1 + nTest1);
 nLeftG2 = nG2 - (nTrain2 + nVal2 + nTest2);
 nTrain1 = nTrain1 + nLeftG1;
 nTrain2 = nTrain2 + nLeftG2;
 nTrain = nTrain1 + nTrain2;
 nVal   = nVal1   + nVal2;
 nTest  = nTest1  + nTest2;
 if (nTrain1+nTrain2+nVal1+nVal2+nTest1+nTest2) != n
    error("Error! Some samples are not assigned!");
 end
 Random.seed!(rngseed);
 for i in 1:nSplits    
    G1IndsShuffle = randperm(nG1);
    G2IndsShuffle = randperm(nG2);
    splits[1 : nTrain              , i] = [indsG1[G1IndsShuffle[1:nTrain1]]; indsG2[G2IndsShuffle[1:nTrain2]]];
    splits[nTrain+1 : nTrain + nVal, i] = [indsG1[G1IndsShuffle[nTrain1 + 1 : nTrain1 + nVal1]]; indsG2[G2IndsShuffle[nTrain2 + 1 : nTrain2 + nVal2]]];
    splits[nTrain+nVal+1 : end     , i] = [indsG1[G1IndsShuffle[nTrain1 + nVal1 + 1 : end]]; indsG2[G2IndsShuffle[nTrain2 + nVal2 + 1 : end]]]; # 60
 end
 nTrainValTest = [nTrain, nVal, nTest];
 return splits, nTrainValTest
 end
 """
    importData(datasetName, datasetPath="/home/jovyan/Datasets/")
 Example: X, y, waves = importData("Beer");
 Returns X, Y/G, [waves], +more for some datasets
 Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb, 
 leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original, 
 Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11
 """
 function importData(datasetName, datasetPath="/home/jovyan/Datasets/")
 #datasetName = "Beer"
 #datasetPath = "/home/jovyan/Datasets/"
 vars = matread(string(datasetPath,datasetName,".mat"));
 if datasetName == "9tumors"
    X = vars["X"];
    G = vars["G"];
 elseif datasetName == "bacteria_K4_K5_K6"
    # Causes crash for some reason
 elseif datasetName == "Beer"
    X = vars["X"];
    Y = vars["Y"]; # G for some datasets
    waves = 400:2:2250;
    return X, Y, waves
 elseif datasetName == "Dough"
    X = [vars["Xtrain"]; vars["Xtest"]];
    Y = [vars["Ytrain"]; vars["Ytest"]];
    #if response != "all"
    #    Y = Y[:,response];
    #    Y = dropdims(Y;dims=2);
    #end
    return X, Y
 elseif datasetName == "FTIR_AMW_tidied"
    X = vars["X"];
    Y = vars["Y"];
 elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP"
 elseif datasetName == "HPLCforweb"
    X = vars["HPLCforweb"]["data"]
 elseif datasetName == "leukemia1"
    X = vars["X"];
    G = vars["G"];
 elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact"
 elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW!
    X = vars["Ramandata"];
    Y = vars["PUFAdata"];
    waves = vars["Wavelengths_Raman"];
    return X, Y, waves
 elseif datasetName == "NMR_WineData"
    X = vars["X"]
    Y = vars["Y"];
    ppm = vars["ppm"];  
    return X, Y, ppm
 elseif datasetName == "onion_NMR"
    X = vars["x"]
    Y = vars["onion"];
    ppm = vars["ppm"];
    return X, Y, ppm
 elseif datasetName == "OVALung"
    X = vars["X"];
    G = vars["y"]; 
    return X, G
 elseif datasetName == "Raman_Adipose_fat_pig_original"
    X = vars["spectra"]
    Y = vars["fat"]
    baseline = vars["baseline"]
    waves = parse.(Float64,vars["wavelength"])
    return X, Y, waves, baseline
 elseif datasetName == "Ramanmilk"
    # There are replicates that need to be handled
    #X = vars["SpectraR"]
    #Y = vars["CLA"]
 elseif datasetName == "RAMANPorkFat"
    X = vars["X"]["data"]
    Y = vars["Y"]["data"]
    return X, Y
 elseif datasetName == "spectra"
    X = vars["NIR"];
    Y = vars["octane"];
    waves = 900:2:1700;    
    return X, Y, waves
 elseif datasetName == "Sugar"
    X = [vars["Xtrain"]; vars["Xtest"]]; Y = [vars["Ytrain"]; vars["Ytest"]];
    waves = vars["wave"];    
    return X, Y, waves
 elseif datasetName == "tumor11"
    X = vars["X"];
    G = vars["G"];    
 end
 end
--- a/src/conveniencePlots.jl
+++ b/src/conveniencePlots.jl
@ -0,0 +1,148 @@
 ### TO-DO: Add support for arbitrary number of plots
 #=
 X, _, waves = importData("Raman_Adipose_fat_plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false);
 pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);ig_original");
 X_SNV = SNV(X);
 X_EMSC = EMSC(X,6);
 plotr(waves, X, X_EMSC);
 plotr(waves, X);
 =#
 function plotr(waves, X::Matrix{Float64})
 plt = plot(waves, X', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 display(plt);
 #return plt;
 end
 function plotr(waves, X1::Matrix{Float64}, X2::Matrix{Float64})
 plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function plotr(waves, X)
 plt = plot(waves, X["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 display(plt);
 #return plt;
 end
 function plotr(waves, X1, X2)
 plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function plotr(waves, X1::Matrix{Float64}, X2)
 plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function plotr(waves, X1, X2::Matrix{Float64})
 plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1::Matrix{Float64}, X2::Matrix{Float64})
 plta = plot(X1', legend=false, grid=false);
 pltb = plot(X2', legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1::Vector{Float64}, X2::Vector{Float64})
 plta = plot(X1, legend=false, grid=false);
 pltb = plot(X2, legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1, X2::Matrix{Float64})
 plta = plot(X1["X_Cor"]', legend=false, grid=false);
 pltb = plot(X2', legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1::Matrix{Float64}, X2)
 plta = plot(X1', legend=false, grid=false);
 pltb = plot(X2["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1, X2)
 plta = plot(X1["X_Cor"]', legend=false, grid=false);
 pltb = plot(X2["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1, X2, X_3)
 plta = plot(X1["X_Cor"]', legend=false, grid=false);
 pltb = plot(X2["X_Cor"]', legend=false, grid=false);
 pltc = plot(X3["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, pltc, layout = (3,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1::Matrix{Float64}, X2, X3)
 plta = plot(X1', legend=false, grid=false);
 pltb = plot(X2["X_Cor"]', legend=false, grid=false);
 pltc = plot(X3["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, pltc, layout = (3,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1, X2::Matrix{Float64}, X3)
 plta = plot(X1["X_Cor"]', legend=false, grid=false);
 pltb = plot(X2', legend=false, grid=false);
 pltc = plot(X3["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, pltc, layout = (3,1));
 display(plt);
 #return plt;
 end
 function compPlot(X1, X2, X3::Matrix{Float64})
 plta = plot(X1["X_Cor"]', legend=false, grid=false);
 pltb = plot(X2["X_Cor"]', legend=false, grid=false);
 pltc = plot(X3', legend=false, grid=false);
 plt  = plot(plta, pltb, pltc, layout = (3,1));
 display(plt);
 #return plt;
 end
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@ -0,0 +1,388 @@
 #=
 X, y    = importData("Raman_Adipose_fat_pig_original"); # RAMANPorkFat
 X_MSC   = MSC(X);
 X_MSCm  = MSC(X, vec(mean(X, dims=1)));
 X_SNV   = SNV(X);
 X_EMSC  = EMSC(X, 6);
 X_EMSCi = EMSC(X, 6, "svd", 1, -1, X[71,:]-X[39,:]); # Only for Raman_Adipose_fat_pig_original
 X_EMSCb = EMSC(X, 6, "svd", 6, 0);
 X_EMSCt = EMSCTraditional(X, 6);
 X_base  = findBaseline(X, 6);
 X_SV1   = savitzkyGolay(X, 6, 5, 2, false);
 # compPlot(X_EMSC, X_EMSCi); # or do the below
 plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false);
 pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false);
 plt  = plot(plta, pltb, layout = (2,1));
 display(plt);
 =#
 """
    function plegendre(d, p)
 Calculates orthonormal Legendre polynomials using a QR factorisation.
 Inputs:
    - d : polynomial degree
    - p : size of vector
 Outputs:
    - Q : (d+1) x p matrix with basis
    - R : matrix from QR-factorisation
 """
 function plegendre(d, p)
 P = ones(p, d+1);
 x = (-1:2/(p-1):1)';
 for k in 1:d
    P[:,k+1] = x.^k;
 end
 factorisation = qr(P);
 Q = Matrix(factorisation.Q)';
 R = Matrix(factorisation.R);
 return Q, R
 end
 """
    function SNV(X)
 Standard Normal Variate (SNV) preprocessing (for each sample: subtract mean, divide by standard deviation)
 Returns dictionary with keys X\\_Cor, means, stds.
 """
 function SNV(X)
 X_SNV = zeros(size(X));
 means = mean(X, dims=2);
 stds  = std(X, dims=2);
 X_SNV = @. (X - means) / stds;
 return_values = Dict([("X_Cor", X_SNV), ("means", means), ("stds", stds)]);
 return return_values
 end
 """
    function MSC(X, ref::String="mean"||Vector{Float64})
 MSC preprocessing (subtract constant trend, scale based on projection onto ref. spectrum)
 Second argument is mean (default) or svd (first right singular vector), or a vector to be used as reference spectrum
 Returns dictionary with keys X\\_Cor, X\\_Ref, coeffs
 """
 function MSC(X, ref::String="mean")
 if ref == "mean"
    X_Ref = vec(mean(X, dims=1));
 elseif ref == "svd"
    u, s, v = svd(X, full=false);
    X_Ref   = v[:,1];
    if X_Ref[1] < 0 # Ad-hoc, attempts to correct for "flipped" spectra
        X_Ref = -X_Ref;
    end
 end
 B      = [ones(size(X,2), 1) X_Ref];
 coeffs = B \ X';
 X_MSC  = @. (X - coeffs[1,:]) / coeffs[2,:];
 return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)])
 return return_values
 end
 function MSC(X, X_Ref::Vector{Float64})
 B     = [ones(size(X,2), 1) X_Ref];
 coeffs = B \ X';
 X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:];
 return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)])
 return return_values
 end
 """
    function EMSCTraditional(X, polDeg=2)
 EMSC correction with mean spectrum as reference and polynomial trends of form LinRange(-1,1,p).^i.
 First argument is spectra to be corrected, second argument is either degree for polynomial regression
 or a basis to be used for correction (basis used should be output of this function, the reference
 spectrum is assumed to be the first basis vector).
 Returns dictionary with keys X\\_Cor, basis, coeffs
 """
 function EMSCTraditional(X, polDeg::Int64=2)
 n, p    = size(X);
 P       = zeros(p, polDeg+1);
 [P[:,i] = LinRange(-1, 1, p).^(i-1) for i in 1:polDeg+1];
 refSpec = mean(X, dims=1)';
 basis   = [refSpec P];
 return_values = EMSCTraditional(X, basis);
 return return_values
 end
 """
    function EMSCTraditionalCorrection(X, basis)
 EMSC correction according to the given input basis that should be
 the output of the EMSCTraditional function.
 """
 function EMSCTraditional(X, basis::Matrix{Float64})
 coeffs  = basis \ X';
 X_Cor   = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:];
 return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]);
 return return_values
 end
 """
    function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false)
 SavitzkyGolay filter.
 Inputs:
    X         - data matrix
    d         - polynomial order
    w         - window size parameter (window size = 2*w+1)
    der_order - derivative order
    keep_endpoints - boolean, truncates spectra if false. NOTE: For derivatives MUST USE FALSE
 Output: Dictionary with keys X\\_Cor, der\\_order, degree, window\\_size, filter\\_coeffs, keep\\_endpoints
 """
 function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false)
 if (d > 2*w+1) | (der_order > d)
        error("Error! Must have d>2*w+1 and der_order > d")
 end
 n, p                = size(X);
 Q, R                = plegendre(d, 2*w+1);
 a                   = factorial(der_order) * (R \ Q)[der_order + 1,:];
 X_Cor               = zeros(n, p);
 X_Cor[:, 1:w]       = X[:, 1:w];
 X_Cor[:, end-w:end] = X[:, end-w:end];
 for i in 1:n
    for j in 1:p-2*w-1
        X_Cor[i,j+w] = X[i, j : j + 2*w]'a;
    end
 end
 if !keep_endpoints
    X_Cor = X_Cor[:, w+1:end];
    X_Cor = X_Cor[:, 1:end-w-1];
 end
 return Dict([("X_Cor", X_Cor), ("der_order", der_order), ("degree", d), ("window_size", w), ("filter_coeffs", a), ("keep_endpoints", keep_endpoints)]);
 end
 """
    function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
    function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
 Finding and correcting baseline using iterative polynomial fits
 Implementation of baseline correcting algorithm in Automated Method for
 Subtraction of Fluorescence from Biological Raman Spectra (Lieber and
 Mahadevan-Jansen) as described in Optimal choice of baseline correction
 for multivariate calibration of spectra (Liland and Mevik).
 NOTE: Changed criterion to while loop to relative error and lowered threshold considerably compared to MATLAB code.
 The first function finds the baseline, the second functions calls the first function repeatedly on all rows in X.
 """
 function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
 p          = length(v);    
 P, _       = plegendre(polDeg, p);
 prevCoeffs = zeros(polDeg+1);
 newCoeffs  = ones(polDeg+1);
 vcur       = v;
 vnew       = copy(v);
 nIter      = 0
 while (norm(P' * (prevCoeffs-newCoeffs))/norm(P'*prevCoeffs) > thresh) && (nIter < maxIter)
 prevCoeffs = newCoeffs;
 newCoeffs  = P' \ vnew;
 vnew       = P' * newCoeffs;
 for i in 1:p
    if vcur[i] < vnew[i]
        vnew[i] = vcur[i];
    end
 end
 nIter = nIter + 1;
 end
 if nIter == maxIter
    println("Warning! Reached maximum number of iterations!");
 end
 baseline = vec(newCoeffs' * P);
 coeffs   = newCoeffs';    
 v_cor    = v - baseline;
 return v_cor, baseline, coeffs
 end
 function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
 n, p     = size(X);
 X_Cor    = zeros(n, p);
 baseline = zeros(n,p);
 coeffs   = zeros(n, polDeg+1);
 for i=1:n
    _, baseline[i,:], coeffs[i,:] = findBaseline(X[i,:], polDeg, maxIter, thresh);
    X_Cor[i,:] = X[i,:] - baseline[i,:];
 end
 return_values = Dict([("baseline", baseline), ("X_Cor", X_Cor)]);
 return return_values
 end
 """
    function EMSC(X, polDeg=2, refType="svd", nRef=1,  baseDeg=-1, intF=0)
 Implementation of EMSC pre-processing based on Skogholt et al. (2018).
 intF is 0 (no interferent), vector, or matrix with intereferents as rows.
 Returns dictionary with keys X\\_Cor, model, coeffs
 """
 function EMSC(X, polDeg=2, refType="svd", nRef=1,  baseDeg=-1, intF=0)
 if intF == 0
    nIntF = 0;
 elseif typeof(intF)==Vector{Float64}
    intF = reshape(intF, 1, length(intF));
    nIntF = 1;
 else
    nIntF = size(intF, 1);
 end  
 n, p    = size(X);
 P, _    = plegendre(polDeg, p);
 refSpec = zeros(nRef, p);
 tot     = polDeg + 1 + nIntF;
 if refType == "svd"
    _, _, v = svd(X, full=false);
    refSpec = v[:,1:nRef];
 elseif refType == "mean"
    refSpec = mean(X, dims=1)';
 end
 # Correcting sign if ref. vectors are flipped. Ad-hoc but appears to work
 # well in practice.
 correlations = cor(X', refSpec);
 for i in 1:nRef
    if sum(correlations[:,i] .< 0) > n/2
        refSpec[i,:] = -refSpec[i,:];
    end
 end
 # Creating basis, need ref. spectra at the end to correct for interferents
 # and polynoial trends.
 if nIntF == 0
    basis, R = qr([P' refSpec]);
    basis    = Matrix(basis)';
 else
    basis, R = qr([P' intF' refSpec]);
    basis    = Matrix(basis)'; 
 end
 # Checking if QR factorisation "flipped" any of the vectors and if so flip them back.
 for i in (tot+1):(tot+nRef)
    if R[i,i] < 0
        R[i,:] = -R[i,:];
        basis[i,:] = -basis[i,:];
    end
 end
 refPolPol = vec( R[1:tot, tot+1]' * basis[1:tot,:] / R[tot+1, tot+1]);
 # Optional baseline correction. Baseline calculated on (first) reference spectrum and same baseline is subtracted
 # from all spectra (to not affect prediction).
 if baseDeg > -1
    _, baseline = findBaseline(refSpec[:,1], baseDeg, 1000, 1e-4);
 else
    baseline = zeros(p);
 end
 model         = EMSCModel(basis, R, Vector([polDeg+1, nIntF, nRef]), refPolPol, baseline);
 return_values = EMSCCorrection(X, model);
 return return_values
 end
 """
 - EMSCCorrection(X, model::EMSCModel)
 Performs EMSC correction on X with given model.
 Returns dictionary with keys X\\_Cor, model, coeffs
 """
 function EMSCCorrection(X, model::EMSCModel)
 n, _ = size(X);
 # Calculate coefficients (utilizing orthogonl basis),
 # Subtract projection onto vectors modelling unwanted additive stuff,
 # Normalise L2-norm of spectra in subspace spanned by reference spectrum
 tot    = sum(model.basisExplanation[1:2]);
 coeffs = X * model.basis';
 mult   = sqrt.(sum(coeffs[:, tot+1:end].^2, dims=2));
 X_Cor  = X - coeffs[:,1:tot] * model.basis[1:tot,:];
 X_Cor  = X_Cor ./ mult;
 # Below is purely for visualisation.
 # The spectra look strange without polynomial trends,
 # so what is done below is adding back the polynomial trends to the reference spectrum.
 # This is done by using the fact that R[1:tot, tot+1] contains the projection of the (first)
 # reference spectrum onto the previous basis vectors. This allows us to easily add back the polynomial trends.
 # As the same vector is added to each spectrum predictions ar eunaffected.
 [X_Cor[i,:] = X_Cor[i,:] - model.refPolPol for i in 1:n];
 [X_Cor[i,:] = X_Cor[i,:] - model.baseline for i in 1:n]; # Surely this should be possible with broadcasting instead...
 # Thought it would be + baseline above, but this works...
 return_values = Dict([("X_Cor", X_Cor), ("model", model), ("coeffs", coeffs)]);
 return return_values
 end