From 974b59dcc31a80201b1f0d59dfc8b85c035a353d Mon Sep 17 00:00:00 2001 From: Joakim Skogholt Date: Wed, 10 May 2023 10:18:58 +0200 Subject: [PATCH] Initial Commit --- Project.toml | 10 ++ src/MinPakke.jl | 29 ++- src/convenience.jl | 303 +++++++++++++++++++++++++++++++ src/conveniencePlots.jl | 148 +++++++++++++++ src/preprocessing.jl | 388 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 876 insertions(+), 2 deletions(-) create mode 100644 src/convenience.jl create mode 100644 src/conveniencePlots.jl create mode 100644 src/preprocessing.jl diff --git a/Project.toml b/Project.toml index 2d3b0e4..6f6cff8 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,16 @@ uuid = "be803360-cecc-4859-8120-03d0223bb960" authors = ["Joakim"] version = "1.0.0-DEV" +[deps] +LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" +OptimizationBBO = "3e6eede4-6085-4f62-9a71-46d9bc1eb92b" +OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + [compat] julia = "1" diff --git a/src/MinPakke.jl b/src/MinPakke.jl index df023d9..8298164 100644 --- a/src/MinPakke.jl +++ b/src/MinPakke.jl @@ -1,5 +1,30 @@ module MinPakke -# Write your package code here. +using LinearAlgebra +using Statistics +using Plots +using MAT +using Optimization +using OptimizationOptimJL +using OptimizationBBO +using LaTeXStrings -end +export plegendre +export SNV +export MSC +export EMSCTraditional +export savitzkyGolay +export findBaseline +export EMSC +export EMSCCorrection + +export plotr +export compPlot + + + +include("preprocessing.jl") +include("convenience.jl") +include("conveniencePlots.jl") + +end \ No newline at end of file diff --git a/src/convenience.jl b/src/convenience.jl new file mode 100644 index 0000000..310b916 --- /dev/null +++ b/src/convenience.jl @@ -0,0 +1,303 @@ +#= +X, Y = importData("spectra"); +splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42); +split = createDataSplit(splits, nTrainValTest, 5, X, Y); +size(split["XTrain"]) + +### + +X, G = importData("OVALung"); +splitStrat, nTVT = createDataSplitBinaryStratified(G, 5); +my_split = createDataSplit(splitStrat, nTVT, 2, X, G); +XVal = my_split["XVal"]; +=# + + + + + + +""" + createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + + +Creates training/validation/test split for dataset + +### Arguments + - 'X' : Data matrix or integer indicating number of samples + - 'nSplits' : Int, number of data splits to create + - 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1 + - 'rngseed' : Int, for reproducibility + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +n = size(X, 1); +nTrainValTest = zeros(3); +splits = convert(Matrix{Int64}, zeros(n, nSplits)); + +if abs(sum(props) - 1) > 1e-14 + error("Error! 
Proportions do not add up to 1"); +end + +nTrain = convert(Int64, floor(n * props[1])); +nVal = convert(Int64, floor(n * props[2])); +nTest = convert(Int64, floor(n * props[3])); +nLeft = n - (nTrain + nVal + nTest); +nTrain = nTrain + nLeft + +Random.seed!(rngseed); + +for i in 1:nSplits + inds = randperm(n); + + splits[1:nTrain , i] = inds[1:nTrain]; + splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal]; + splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end]; +end + +nTrainValTest = [nTrain, nVal, nTest]; + +return splits, nTrainValTest +end + +""" + createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +Creates training/validation/test split for dataset + +### Arguments + - 'X' : Int, number of samples + - 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1 + - 'nSplits' : Int, number of data splits + - 'rngseed' : Int, for reproducibility + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +n = X; +nTrainValTest = zeros(3); +splits = convert(Matrix{Int64}, zeros(n, nSplits)); + +if abs(sum(props) - 1) > 1e-14 + error("Error! Proportions do not add up to 1"); +end + +nTrain = convert(Int64, floor(n * props[1])); +nVal = convert(Int64, floor(n * props[2])); +nTest = convert(Int64, floor(n * props[3])); +nLeft = n - (nTrain + nVal + nTest); +nTrain = nTrain + nLeft + +Random.seed!(rngseed); + +for i in 1:nSplits + inds = randperm(n); + + splits[1:nTrain , i] = inds[1:nTrain]; + splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal]; + splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end]; +end + +nTrainValTest = [nTrain, nVal, nTest]; + +return splits, nTrainValTest +end + +""" + createDataSplit(splits, nTrainValTest, splitInd, X, Y) + +Creates training/validation/test split for dataset + +### Arguments + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing + - 'splitInd' : Index of split to use + - 'X' and 'Y' : Data matrix and response vector/matrix + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplit(splits, nTrainValTest, splitInd, X, Y) + +XTrain = X[splits[1 : nTrainValTest[1], splitInd], :]; +XVal = X[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :]; +XTest = X[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :]; + +YTrain = Y[splits[1 : nTrainValTest[1], splitInd], :]; +YVal = Y[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :]; +YTest = Y[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :]; + +return Dict([("XTrain", XTrain), ("XVal", XVal), ("XTest", XTest), ("YTrain", YTrain), ("YVal", YVal), ("YTest", YTest)]); +end + +""" + createDataSplitBinaryStratified(G::Vector{Int64}, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +Creates stratified training/validation/test split for dataset. 
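+A sketch of the intended call pattern, mirroring the usage comment at the top of this file
+(assuming 'G' is the binary group vector and 'X' is the matching data matrix):
+
+    splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
+    my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
+    XVal = my_split["XVal"];
+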
Assumes G is coded as 1, 2 + +### Arguments + - 'G' : Vector{Int64}, group membership coded as 1, 2 + - 'nSplits' : Int, number of data splits + - 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1 + - 'rngseed' : Int, for reproducibility + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +n = length(G); +splits = convert(Matrix{Int64}, zeros(n, nSplits)); + +indsG1 = findall( @. abs(G - 1.0) < 1e-14) +indsG2 = findall( @. abs(G - 2.0) < 1e-14) + +nG1 = length(indsG1); +nG2 = length(indsG2); + +propG1 = nG1 / n +propG2 = 1 - propG1; + +nTrain1 = convert(Int64, floor(n * props[1] * propG1)); +nVal1 = convert(Int64, floor(n * props[2] * propG1)); +nTest1 = convert(Int64, floor(n * props[3] * propG1)); +nTrain2 = convert(Int64, floor(n * props[1] * propG2)); +nVal2 = convert(Int64, floor(n * props[2] * propG2)); +nTest2 = convert(Int64, floor(n * props[3] * propG2)); + +# Unassigned samples (if any) are added to the training set) +nLeftG1 = nG1 - (nTrain1 + nVal1 + nTest1); +nLeftG2 = nG2 - (nTrain2 + nVal2 + nTest2); + +nTrain1 = nTrain1 + nLeftG1; +nTrain2 = nTrain2 + nLeftG2; + +nTrain = nTrain1 + nTrain2; +nVal = nVal1 + nVal2; +nTest = nTest1 + nTest2; + +if (nTrain1+nTrain2+nVal1+nVal2+nTest1+nTest2) != n + error("Error! Some samples are not assigned!"); +end + +Random.seed!(rngseed); + +for i in 1:nSplits + G1IndsShuffle = randperm(nG1); + G2IndsShuffle = randperm(nG2); + + splits[1 : nTrain , i] = [indsG1[G1IndsShuffle[1:nTrain1]]; indsG2[G2IndsShuffle[1:nTrain2]]]; + splits[nTrain+1 : nTrain + nVal, i] = [indsG1[G1IndsShuffle[nTrain1 + 1 : nTrain1 + nVal1]]; indsG2[G2IndsShuffle[nTrain2 + 1 : nTrain2 + nVal2]]]; + splits[nTrain+nVal+1 : end , i] = [indsG1[G1IndsShuffle[nTrain1 + nVal1 + 1 : end]]; indsG2[G2IndsShuffle[nTrain2 + nVal2 + 1 : end]]]; # 60 +end + +nTrainValTest = [nTrain, nVal, nTest]; + +return splits, nTrainValTest +end + + + + +""" + importData(datasetName, datasetPath="/home/jovyan/Datasets/") + +Example: X, y, waves = importData("Beer"); +Returns X, Y/G, [waves], +more for some datasets +Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb, +leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original, +Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11 +""" +function importData(datasetName, datasetPath="/home/jovyan/Datasets/") + +#datasetName = "Beer" +#datasetPath = "/home/jovyan/Datasets/" +vars = matread(string(datasetPath,datasetName,".mat")); + +if datasetName == "9tumors" + X = vars["X"]; + G = vars["G"]; +elseif datasetName == "bacteria_K4_K5_K6" + # Causes crash for some reason +elseif datasetName == "Beer" + X = vars["X"]; + Y = vars["Y"]; # G for some datasets + waves = 400:2:2250; + return X, Y, waves +elseif datasetName == "Dough" + X = [vars["Xtrain"]; vars["Xtest"]]; + Y = [vars["Ytrain"]; vars["Ytest"]]; + #if response != "all" + # Y = Y[:,response]; + # Y = dropdims(Y;dims=2); + #end + return X, Y +elseif datasetName == "FTIR_AMW_tidied" + X = vars["X"]; + Y = vars["Y"]; +elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP" + +elseif datasetName == "HPLCforweb" + X = vars["HPLCforweb"]["data"] +elseif datasetName == 
"leukemia1" + X = vars["X"]; + G = vars["G"]; +elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact" + +elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW! + X = vars["Ramandata"]; + Y = vars["PUFAdata"]; + waves = vars["Wavelengths_Raman"]; + return X, Y, waves +elseif datasetName == "NMR_WineData" + X = vars["X"] + Y = vars["Y"]; + ppm = vars["ppm"]; + return X, Y, ppm +elseif datasetName == "onion_NMR" + X = vars["x"] + Y = vars["onion"]; + ppm = vars["ppm"]; + return X, Y, ppm +elseif datasetName == "OVALung" + X = vars["X"]; + G = vars["y"]; + return X, G +elseif datasetName == "Raman_Adipose_fat_pig_original" + X = vars["spectra"] + Y = vars["fat"] + baseline = vars["baseline"] + waves = parse.(Float64,vars["wavelength"]) + return X, Y, waves, baseline +elseif datasetName == "Ramanmilk" + # There are replicates that need to be handled + #X = vars["SpectraR"] + #Y = vars["CLA"] +elseif datasetName == "RAMANPorkFat" + X = vars["X"]["data"] + Y = vars["Y"]["data"] + return X, Y +elseif datasetName == "spectra" + X = vars["NIR"]; + Y = vars["octane"]; + waves = 900:2:1700; + return X, Y, waves +elseif datasetName == "Sugar" + X = [vars["Xtrain"]; vars["Xtest"]]; Y = [vars["Ytrain"]; vars["Ytest"]]; + waves = vars["wave"]; + return X, Y, waves +elseif datasetName == "tumor11" + X = vars["X"]; + G = vars["G"]; +end +end \ No newline at end of file diff --git a/src/conveniencePlots.jl b/src/conveniencePlots.jl new file mode 100644 index 0000000..228403e --- /dev/null +++ b/src/conveniencePlots.jl @@ -0,0 +1,148 @@ +### TO-DO: Add support for arbitrary number of plots + +#= +X, _, waves = importData("Raman_Adipose_fat_plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false); +pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt);ig_original"); +X_SNV = SNV(X); +X_EMSC = EMSC(X,6); +plotr(waves, X, X_EMSC); +plotr(waves, X); +=# + + +function plotr(waves, X::Matrix{Float64}) + +plt = plot(waves, X', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +display(plt); +#return plt; +end + +function plotr(waves, X1::Matrix{Float64}, X2::Matrix{Float64}) + +plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function plotr(waves, X) + +plt = plot(waves, X["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +display(plt); +#return plt; +end + +function plotr(waves, X1, X2) + +plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function plotr(waves, X1::Matrix{Float64}, X2) + +plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. 
units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function plotr(waves, X1, X2::Matrix{Float64}) + +plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Matrix{Float64}, X2::Matrix{Float64}) + +plta = plot(X1', legend=false, grid=false); +pltb = plot(X2', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Vector{Float64}, X2::Vector{Float64}) + +plta = plot(X1, legend=false, grid=false); +pltb = plot(X2, legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2::Matrix{Float64}) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Matrix{Float64}, X2) + +plta = plot(X1', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2, X_3) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +pltc = plot(X3["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Matrix{Float64}, X2, X3) + +plta = plot(X1', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +pltc = plot(X3["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2::Matrix{Float64}, X3) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2', legend=false, grid=false); +pltc = plot(X3["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2, X3::Matrix{Float64}) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +pltc = plot(X3', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end \ No newline at end of file diff --git a/src/preprocessing.jl b/src/preprocessing.jl new file mode 100644 index 0000000..091604d --- /dev/null +++ b/src/preprocessing.jl @@ -0,0 +1,388 @@ +#= +X, y = importData("Raman_Adipose_fat_pig_original"); # RAMANPorkFat +X_MSC = MSC(X); +X_MSCm = MSC(X, vec(mean(X, dims=1))); +X_SNV = SNV(X); +X_EMSC = EMSC(X, 6); +X_EMSCi = EMSC(X, 6, "svd", 1, -1, X[71,:]-X[39,:]); # Only for Raman_Adipose_fat_pig_original +X_EMSCb = EMSC(X, 6, "svd", 6, 0); +X_EMSCt = EMSCTraditional(X, 6); +X_base = findBaseline(X, 6); +X_SV1 = savitzkyGolay(X, 6, 5, 2, false); + +# compPlot(X_EMSC, X_EMSCi); # or do the below +plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false); +pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false); +plt = plot(plta, 
pltb, layout = (2,1)); +display(plt); +=# + + +""" + function plegendre(d, p) + +Calculates orthonormal Legendre polynomials using a QR factorisation. +Inputs: + - d : polynomial degree + - p : size of vector + +Outputs: + - Q : (d+1) x p matrix with basis + - R : matrix from QR-factorisation +""" + +function plegendre(d, p) + +P = ones(p, d+1); +x = (-1:2/(p-1):1)'; +for k in 1:d + P[:,k+1] = x.^k; +end + +factorisation = qr(P); +Q = Matrix(factorisation.Q)'; +R = Matrix(factorisation.R); + +return Q, R +end + + + +""" + function SNV(X) + +Standard Normal Variate (SNV) preprocessing (for each sample: subtract mean, divide by standard deviation) +Returns dictionary with keys X\\_Cor, means, stds. +""" +function SNV(X) + +X_SNV = zeros(size(X)); +means = mean(X, dims=2); +stds = std(X, dims=2); +X_SNV = @. (X - means) / stds; + +return_values = Dict([("X_Cor", X_SNV), ("means", means), ("stds", stds)]); +return return_values +end + + + + + + +""" + function MSC(X, ref::String="mean"||Vector{Float64}) + +MSC preprocessing (subtract constant trend, scale based on projection onto ref. spectrum) +Second argument is mean (default) or svd (first right singular vector), or a vector to be used as reference spectrum +Returns dictionary with keys X\\_Cor, X\\_Ref, coeffs +""" +function MSC(X, ref::String="mean") + +if ref == "mean" + X_Ref = vec(mean(X, dims=1)); +elseif ref == "svd" + u, s, v = svd(X, full=false); + X_Ref = v[:,1]; + if X_Ref[1] < 0 # Ad-hoc, attempts to correct for "flipped" spectra + X_Ref = -X_Ref; + end +end + +B = [ones(size(X,2), 1) X_Ref]; +coeffs = B \ X'; +X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:]; + +return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)]) +return return_values +end + + + +function MSC(X, X_Ref::Vector{Float64}) + +B = [ones(size(X,2), 1) X_Ref]; +coeffs = B \ X'; +X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:]; + +return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)]) +return return_values +end + + + + +""" + function EMSCTraditional(X, polDeg=2) + +EMSC correction with mean spectrum as reference and polynomial trends of form LinRange(-1,1,p).^i. +First argument is spectra to be corrected, second argument is either degree for polynomial regression +or a basis to be used for correction (basis used should be output of this function, the reference +spectrum is assumed to be the first basis vector). +Returns dictionary with keys X\\_Cor, basis, coeffs +""" +function EMSCTraditional(X, polDeg::Int64=2) + +n, p = size(X); +P = zeros(p, polDeg+1); +[P[:,i] = LinRange(-1, 1, p).^(i-1) for i in 1:polDeg+1]; +refSpec = mean(X, dims=1)'; +basis = [refSpec P]; + +return_values = EMSCTraditional(X, basis); +return return_values +end + + +""" + function EMSCTraditionalCorrection(X, basis) + +EMSC correction according to the given input basis that should be +the output of the EMSCTraditional function. +""" + +function EMSCTraditional(X, basis::Matrix{Float64}) + +coeffs = basis \ X'; +X_Cor = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:]; + +return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]); +return return_values +end + + + + + + +""" + function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false) + +SavitzkyGolay filter. +Inputs: + X - data matrix + d - polynomial order + w - window size parameter (window size = 2*w+1) + der_order - derivative order + keep_endpoints - boolean, truncates spectra if false. 
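+A minimal usage sketch, mirroring the call in the usage comment at the top of this file
+(assuming 'X' holds spectra in rows):
+
+    X_SG = savitzkyGolay(X, 6, 5, 2, false);   # 2nd derivative, degree-6 fit, window of 2*5+1 = 11 points
+    D2 = X_SG["X_Cor"];                        # endpoint columns are trimmed since keep_endpoints=false
+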
NOTE: For derivatives MUST USE FALSE + +Output: Dictionary with keys X\\_Cor, der\\_order, degree, window\\_size, filter\\_coeffs, keep\\_endpoints +""" +function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false) + +if (d > 2*w+1) | (der_order > d) + error("Error! Must have d>2*w+1 and der_order > d") +end + +n, p = size(X); +Q, R = plegendre(d, 2*w+1); +a = factorial(der_order) * (R \ Q)[der_order + 1,:]; +X_Cor = zeros(n, p); +X_Cor[:, 1:w] = X[:, 1:w]; +X_Cor[:, end-w:end] = X[:, end-w:end]; + +for i in 1:n + for j in 1:p-2*w-1 + X_Cor[i,j+w] = X[i, j : j + 2*w]'a; + end +end + +if !keep_endpoints + X_Cor = X_Cor[:, w+1:end]; + X_Cor = X_Cor[:, 1:end-w-1]; +end + +return Dict([("X_Cor", X_Cor), ("der_order", der_order), ("degree", d), ("window_size", w), ("filter_coeffs", a), ("keep_endpoints", keep_endpoints)]); +end + + + + + + + + + + + +""" + function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2) + function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2) + +Finding and correcting baseline using iterative polynomial fits +Implementation of baseline correcting algorithm in Automated Method for +Subtraction of Fluorescence from Biological Raman Spectra (Lieber and +Mahadevan-Jansen) as described in Optimal choice of baseline correction +for multivariate calibration of spectra (Liland and Mevik). +NOTE: Changed criterion to while loop to relative error and lowered threshold considerably compared to MATLAB code. +The first function finds the baseline, the second functions calls the first function repeatedly on all rows in X. +""" +function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2) +p = length(v); +P, _ = plegendre(polDeg, p); +prevCoeffs = zeros(polDeg+1); +newCoeffs = ones(polDeg+1); +vcur = v; +vnew = copy(v); +nIter = 0 + +while (norm(P' * (prevCoeffs-newCoeffs))/norm(P'*prevCoeffs) > thresh) && (nIter < maxIter) + +prevCoeffs = newCoeffs; +newCoeffs = P' \ vnew; +vnew = P' * newCoeffs; + +for i in 1:p + if vcur[i] < vnew[i] + vnew[i] = vcur[i]; + end +end + +nIter = nIter + 1; +end + +if nIter == maxIter + println("Warning! Reached maximum number of iterations!"); +end + +baseline = vec(newCoeffs' * P); +coeffs = newCoeffs'; +v_cor = v - baseline; + +return v_cor, baseline, coeffs +end + + + +function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2) + +n, p = size(X); +X_Cor = zeros(n, p); +baseline = zeros(n,p); +coeffs = zeros(n, polDeg+1); + +for i=1:n + _, baseline[i,:], coeffs[i,:] = findBaseline(X[i,:], polDeg, maxIter, thresh); + X_Cor[i,:] = X[i,:] - baseline[i,:]; +end + +return_values = Dict([("baseline", baseline), ("X_Cor", X_Cor)]); +return return_values +end + + + + + + + + + + +""" + function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0) + +Implementation of EMSC pre-processing based on Skogholt et al. (2018). +intF is 0 (no interferent), vector, or matrix with intereferents as rows. +Returns dictionary with keys X\\_Cor, model, coeffs +""" +function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0) + +if intF == 0 + nIntF = 0; +elseif typeof(intF)==Vector{Float64} + intF = reshape(intF, 1, length(intF)); + nIntF = 1; +else + nIntF = size(intF, 1); +end + +n, p = size(X); +P, _ = plegendre(polDeg, p); +refSpec = zeros(nRef, p); +tot = polDeg + 1 + nIntF; + +if refType == "svd" + _, _, v = svd(X, full=false); + refSpec = v[:,1:nRef]; +elseif refType == "mean" + refSpec = mean(X, dims=1)'; +end + +# Correcting sign if ref. vectors are flipped. 
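+# (The sign of an SVD component is arbitrary: a reference component that correlates negatively with
+# more than half of the spectra is negated, so that corrected spectra keep the conventional orientation.)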
+# Ad-hoc, but appears to work well in practice.
+correlations = cor(X', refSpec);
+for i in 1:nRef
+    if sum(correlations[:,i] .< 0) > n/2
+        refSpec[i,:] = -refSpec[i,:];
+    end
+end
+
+# Creating the basis; the reference spectra are placed at the end so that interferents
+# and polynomial trends can be corrected for.
+if nIntF == 0
+    basis, R = qr([P' refSpec]);
+    basis = Matrix(basis)';
+else
+    basis, R = qr([P' intF' refSpec]);
+    basis = Matrix(basis)';
+end
+
+# Checking if the QR factorisation "flipped" any of the vectors and, if so, flipping them back.
+for i in (tot+1):(tot+nRef)
+    if R[i,i] < 0
+        R[i,:] = -R[i,:];
+        basis[i,:] = -basis[i,:];
+    end
+end
+
+refPolPol = vec(R[1:tot, tot+1]' * basis[1:tot,:] / R[tot+1, tot+1]);
+
+# Optional baseline correction. The baseline is calculated on the (first) reference spectrum and the same
+# baseline is subtracted from all spectra (so that predictions are not affected).
+if baseDeg > -1
+    _, baseline = findBaseline(refSpec[:,1], baseDeg, 1000, 1e-4);
+else
+    baseline = zeros(p);
+end
+
+model = EMSCModel(basis, R, Vector([polDeg+1, nIntF, nRef]), refPolPol, baseline);
+return_values = EMSCCorrection(X, model);
+
+return return_values
+end
+
+
+
+"""
+    EMSCCorrection(X, model::EMSCModel)
+
+Performs EMSC correction on X with the given model.
+Returns dictionary with keys X\\_Cor, model, coeffs
+"""
+function EMSCCorrection(X, model::EMSCModel)
+
+n, _ = size(X);
+
+# Calculate coefficients (utilising the orthogonal basis),
+# subtract the projection onto the vectors modelling unwanted additive effects,
+# and normalise the L2-norm of each spectrum in the subspace spanned by the reference spectra.
+tot = sum(model.basisExplanation[1:2]);
+coeffs = X * model.basis';
+mult = sqrt.(sum(coeffs[:, tot+1:end].^2, dims=2));
+X_Cor = X - coeffs[:,1:tot] * model.basis[1:tot,:];
+X_Cor = X_Cor ./ mult;
+
+# The rest is purely for visualisation. The spectra look strange without polynomial trends,
+# so the polynomial trends of the reference spectrum are added back. This uses the fact that
+# R[1:tot, tot+1] contains the projection of the (first) reference spectrum onto the previous
+# basis vectors. As the same vector is added to every spectrum, predictions are unaffected.
+[X_Cor[i,:] = X_Cor[i,:] - model.refPolPol for i in 1:n];
+[X_Cor[i,:] = X_Cor[i,:] - model.baseline for i in 1:n]; # Surely this should be possible with broadcasting instead...
+# Thought it would be + baseline above, but this works...
+
+return_values = Dict([("X_Cor", X_Cor), ("model", model), ("coeffs", coeffs)]);
+return return_values
+end
\ No newline at end of file
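Note: `EMSC` and `EMSCCorrection` construct and consume an `EMSCModel`, but no definition of that type is included in this patch, and it is not exported from MinPakke.jl. A minimal sketch of what such a definition might look like (the field names are inferred from how `model` is used above; this is an assumption, not part of the original code):

    # Hypothetical container for a fitted EMSC model (not defined anywhere in this patch).
    struct EMSCModel
        basis               # orthonormal basis; reference spectra occupy the last nRef rows
        R                   # R factor from the QR factorisation of [P' intF' refSpec]
        basisExplanation    # [polDeg + 1, nIntF, nRef]
        refPolPol           # polynomial part of the (first) reference spectrum, re-added for visualisation
        baseline            # baseline estimated on the reference spectrum (zeros(p) if baseDeg == -1)
    end

With a definition along these lines included before preprocessing.jl, and with `using Random` added to MinPakke.jl plus a matching [deps] entry (convenience.jl calls `Random.seed!` and `randperm`), the usage sketches in the `#= =#` blocks at the top of convenience.jl and preprocessing.jl should run as written.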