From 974b59dcc31a80201b1f0d59dfc8b85c035a353d Mon Sep 17 00:00:00 2001 From: Joakim Skogholt Date: Wed, 10 May 2023 10:18:58 +0200 Subject: [PATCH] Initial Commit --- Project.toml | 10 ++ src/MinPakke.jl | 29 ++- src/convenience.jl | 303 +++++++++++++++++++++++++++++++ src/conveniencePlots.jl | 148 +++++++++++++++ src/preprocessing.jl | 388 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 876 insertions(+), 2 deletions(-) create mode 100644 src/convenience.jl create mode 100644 src/conveniencePlots.jl create mode 100644 src/preprocessing.jl diff --git a/Project.toml b/Project.toml index 2d3b0e4..6f6cff8 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,16 @@ uuid = "be803360-cecc-4859-8120-03d0223bb960" authors = ["Joakim"] version = "1.0.0-DEV" +[deps] +LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" +OptimizationBBO = "3e6eede4-6085-4f62-9a71-46d9bc1eb92b" +OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + [compat] julia = "1" diff --git a/src/MinPakke.jl b/src/MinPakke.jl index df023d9..8298164 100644 --- a/src/MinPakke.jl +++ b/src/MinPakke.jl @@ -1,5 +1,30 @@ module MinPakke -# Write your package code here. +using LinearAlgebra +using Statistics +using Plots +using MAT +using Optimization +using OptimizationOptimJL +using OptimizationBBO +using LaTeXStrings -end +export plegendre +export SNV +export MSC +export EMSCTraditional +export savitzkyGolay +export findBaseline +export EMSC +export EMSCCorrection + +export plotr +export compPlot + + + +include("preprocessing.jl") +include("convenience.jl") +include("conveniencePlots.jl") + +end \ No newline at end of file diff --git a/src/convenience.jl b/src/convenience.jl new file mode 100644 index 0000000..310b916 --- /dev/null +++ b/src/convenience.jl @@ -0,0 +1,303 @@ +#= +X, Y = importData("spectra"); +splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42); +split = createDataSplit(splits, nTrainValTest, 5, X, Y); +size(split["XTrain"]) + +### + +X, G = importData("OVALung"); +splitStrat, nTVT = createDataSplitBinaryStratified(G, 5); +my_split = createDataSplit(splitStrat, nTVT, 2, X, G); +XVal = my_split["XVal"]; +=# + + + + + + +""" + createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + + +Creates training/validation/test split for dataset + +### Arguments + - 'X' : Data matrix or integer indicating number of samples + - 'nSplits' : Int, number of data splits to create + - 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1 + - 'rngseed' : Int, for reproducibility + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +n = size(X, 1); +nTrainValTest = zeros(3); +splits = convert(Matrix{Int64}, zeros(n, nSplits)); + +if abs(sum(props) - 1) > 1e-14 + error("Error! 
Proportions do not add up to 1"); +end + +nTrain = convert(Int64, floor(n * props[1])); +nVal = convert(Int64, floor(n * props[2])); +nTest = convert(Int64, floor(n * props[3])); +nLeft = n - (nTrain + nVal + nTest); +nTrain = nTrain + nLeft + +Random.seed!(rngseed); + +for i in 1:nSplits + inds = randperm(n); + + splits[1:nTrain , i] = inds[1:nTrain]; + splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal]; + splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end]; +end + +nTrainValTest = [nTrain, nVal, nTest]; + +return splits, nTrainValTest +end + +""" + createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +Creates training/validation/test split for dataset + +### Arguments + - 'X' : Int, number of samples + - 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1 + - 'nSplits' : Int, number of data splits + - 'rngseed' : Int, for reproducibility + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +n = X; +nTrainValTest = zeros(3); +splits = convert(Matrix{Int64}, zeros(n, nSplits)); + +if abs(sum(props) - 1) > 1e-14 + error("Error! Proportions do not add up to 1"); +end + +nTrain = convert(Int64, floor(n * props[1])); +nVal = convert(Int64, floor(n * props[2])); +nTest = convert(Int64, floor(n * props[3])); +nLeft = n - (nTrain + nVal + nTest); +nTrain = nTrain + nLeft + +Random.seed!(rngseed); + +for i in 1:nSplits + inds = randperm(n); + + splits[1:nTrain , i] = inds[1:nTrain]; + splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal]; + splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end]; +end + +nTrainValTest = [nTrain, nVal, nTest]; + +return splits, nTrainValTest +end + +""" + createDataSplit(splits, nTrainValTest, splitInd, X, Y) + +Creates training/validation/test split for dataset + +### Arguments + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing + - 'splitInd' : Index of split to use + - 'X' and 'Y' : Data matrix and response vector/matrix + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplit(splits, nTrainValTest, splitInd, X, Y) + +XTrain = X[splits[1 : nTrainValTest[1], splitInd], :]; +XVal = X[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :]; +XTest = X[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :]; + +YTrain = Y[splits[1 : nTrainValTest[1], splitInd], :]; +YVal = Y[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :]; +YTest = Y[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :]; + +return Dict([("XTrain", XTrain), ("XVal", XVal), ("XTest", XTest), ("YTrain", YTrain), ("YVal", YVal), ("YTest", YTest)]); +end + +""" + createDataSplitBinaryStratified(G::Vector{Int64}, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +Creates stratified training/validation/test split for dataset. 
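+A sketch of the intended call pattern, mirroring the usage comment at the top of this file
+(assuming 'G' is the binary group vector and 'X' is the matching data matrix):
+
+    splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
+    my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
+    XVal = my_split["XVal"];
+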
Assumes G is coded as 1, 2 + +### Arguments + - 'G' : Vector{Int64}, group membership coded as 1, 2 + - 'nSplits' : Int, number of data splits + - 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1 + - 'rngseed' : Int, for reproducibility + +### Returns + - 'splits' : size(X,1) x nSplits matrix with indices + - 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing +""" +function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42) + +n = length(G); +splits = convert(Matrix{Int64}, zeros(n, nSplits)); + +indsG1 = findall( @. abs(G - 1.0) < 1e-14) +indsG2 = findall( @. abs(G - 2.0) < 1e-14) + +nG1 = length(indsG1); +nG2 = length(indsG2); + +propG1 = nG1 / n +propG2 = 1 - propG1; + +nTrain1 = convert(Int64, floor(n * props[1] * propG1)); +nVal1 = convert(Int64, floor(n * props[2] * propG1)); +nTest1 = convert(Int64, floor(n * props[3] * propG1)); +nTrain2 = convert(Int64, floor(n * props[1] * propG2)); +nVal2 = convert(Int64, floor(n * props[2] * propG2)); +nTest2 = convert(Int64, floor(n * props[3] * propG2)); + +# Unassigned samples (if any) are added to the training set) +nLeftG1 = nG1 - (nTrain1 + nVal1 + nTest1); +nLeftG2 = nG2 - (nTrain2 + nVal2 + nTest2); + +nTrain1 = nTrain1 + nLeftG1; +nTrain2 = nTrain2 + nLeftG2; + +nTrain = nTrain1 + nTrain2; +nVal = nVal1 + nVal2; +nTest = nTest1 + nTest2; + +if (nTrain1+nTrain2+nVal1+nVal2+nTest1+nTest2) != n + error("Error! Some samples are not assigned!"); +end + +Random.seed!(rngseed); + +for i in 1:nSplits + G1IndsShuffle = randperm(nG1); + G2IndsShuffle = randperm(nG2); + + splits[1 : nTrain , i] = [indsG1[G1IndsShuffle[1:nTrain1]]; indsG2[G2IndsShuffle[1:nTrain2]]]; + splits[nTrain+1 : nTrain + nVal, i] = [indsG1[G1IndsShuffle[nTrain1 + 1 : nTrain1 + nVal1]]; indsG2[G2IndsShuffle[nTrain2 + 1 : nTrain2 + nVal2]]]; + splits[nTrain+nVal+1 : end , i] = [indsG1[G1IndsShuffle[nTrain1 + nVal1 + 1 : end]]; indsG2[G2IndsShuffle[nTrain2 + nVal2 + 1 : end]]]; # 60 +end + +nTrainValTest = [nTrain, nVal, nTest]; + +return splits, nTrainValTest +end + + + + +""" + importData(datasetName, datasetPath="/home/jovyan/Datasets/") + +Example: X, y, waves = importData("Beer"); +Returns X, Y/G, [waves], +more for some datasets +Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb, +leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original, +Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11 +""" +function importData(datasetName, datasetPath="/home/jovyan/Datasets/") + +#datasetName = "Beer" +#datasetPath = "/home/jovyan/Datasets/" +vars = matread(string(datasetPath,datasetName,".mat")); + +if datasetName == "9tumors" + X = vars["X"]; + G = vars["G"]; +elseif datasetName == "bacteria_K4_K5_K6" + # Causes crash for some reason +elseif datasetName == "Beer" + X = vars["X"]; + Y = vars["Y"]; # G for some datasets + waves = 400:2:2250; + return X, Y, waves +elseif datasetName == "Dough" + X = [vars["Xtrain"]; vars["Xtest"]]; + Y = [vars["Ytrain"]; vars["Ytest"]]; + #if response != "all" + # Y = Y[:,response]; + # Y = dropdims(Y;dims=2); + #end + return X, Y +elseif datasetName == "FTIR_AMW_tidied" + X = vars["X"]; + Y = vars["Y"]; +elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP" + +elseif datasetName == "HPLCforweb" + X = vars["HPLCforweb"]["data"] +elseif datasetName == 
"leukemia1" + X = vars["X"]; + G = vars["G"]; +elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact" + +elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW! + X = vars["Ramandata"]; + Y = vars["PUFAdata"]; + waves = vars["Wavelengths_Raman"]; + return X, Y, waves +elseif datasetName == "NMR_WineData" + X = vars["X"] + Y = vars["Y"]; + ppm = vars["ppm"]; + return X, Y, ppm +elseif datasetName == "onion_NMR" + X = vars["x"] + Y = vars["onion"]; + ppm = vars["ppm"]; + return X, Y, ppm +elseif datasetName == "OVALung" + X = vars["X"]; + G = vars["y"]; + return X, G +elseif datasetName == "Raman_Adipose_fat_pig_original" + X = vars["spectra"] + Y = vars["fat"] + baseline = vars["baseline"] + waves = parse.(Float64,vars["wavelength"]) + return X, Y, waves, baseline +elseif datasetName == "Ramanmilk" + # There are replicates that need to be handled + #X = vars["SpectraR"] + #Y = vars["CLA"] +elseif datasetName == "RAMANPorkFat" + X = vars["X"]["data"] + Y = vars["Y"]["data"] + return X, Y +elseif datasetName == "spectra" + X = vars["NIR"]; + Y = vars["octane"]; + waves = 900:2:1700; + return X, Y, waves +elseif datasetName == "Sugar" + X = [vars["Xtrain"]; vars["Xtest"]]; Y = [vars["Ytrain"]; vars["Ytest"]]; + waves = vars["wave"]; + return X, Y, waves +elseif datasetName == "tumor11" + X = vars["X"]; + G = vars["G"]; +end +end \ No newline at end of file diff --git a/src/conveniencePlots.jl b/src/conveniencePlots.jl new file mode 100644 index 0000000..228403e --- /dev/null +++ b/src/conveniencePlots.jl @@ -0,0 +1,148 @@ +### TO-DO: Add support for arbitrary number of plots + +#= +X, _, waves = importData("Raman_Adipose_fat_plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false); +pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt);ig_original"); +X_SNV = SNV(X); +X_EMSC = EMSC(X,6); +plotr(waves, X, X_EMSC); +plotr(waves, X); +=# + + +function plotr(waves, X::Matrix{Float64}) + +plt = plot(waves, X', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +display(plt); +#return plt; +end + +function plotr(waves, X1::Matrix{Float64}, X2::Matrix{Float64}) + +plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function plotr(waves, X) + +plt = plot(waves, X["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +display(plt); +#return plt; +end + +function plotr(waves, X1, X2) + +plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function plotr(waves, X1::Matrix{Float64}, X2) + +plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. 
units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function plotr(waves, X1, X2::Matrix{Float64}) + +plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Matrix{Float64}, X2::Matrix{Float64}) + +plta = plot(X1', legend=false, grid=false); +pltb = plot(X2', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Vector{Float64}, X2::Vector{Float64}) + +plta = plot(X1, legend=false, grid=false); +pltb = plot(X2, legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2::Matrix{Float64}) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Matrix{Float64}, X2) + +plta = plot(X1', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, layout = (2,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2, X_3) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +pltc = plot(X3["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end + +function compPlot(X1::Matrix{Float64}, X2, X3) + +plta = plot(X1', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +pltc = plot(X3["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2::Matrix{Float64}, X3) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2', legend=false, grid=false); +pltc = plot(X3["X_Cor"]', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end + +function compPlot(X1, X2, X3::Matrix{Float64}) + +plta = plot(X1["X_Cor"]', legend=false, grid=false); +pltb = plot(X2["X_Cor"]', legend=false, grid=false); +pltc = plot(X3', legend=false, grid=false); +plt = plot(plta, pltb, pltc, layout = (3,1)); +display(plt); +#return plt; +end \ No newline at end of file diff --git a/src/preprocessing.jl b/src/preprocessing.jl new file mode 100644 index 0000000..091604d --- /dev/null +++ b/src/preprocessing.jl @@ -0,0 +1,388 @@ +#= +X, y = importData("Raman_Adipose_fat_pig_original"); # RAMANPorkFat +X_MSC = MSC(X); +X_MSCm = MSC(X, vec(mean(X, dims=1))); +X_SNV = SNV(X); +X_EMSC = EMSC(X, 6); +X_EMSCi = EMSC(X, 6, "svd", 1, -1, X[71,:]-X[39,:]); # Only for Raman_Adipose_fat_pig_original +X_EMSCb = EMSC(X, 6, "svd", 6, 0); +X_EMSCt = EMSCTraditional(X, 6); +X_base = findBaseline(X, 6); +X_SV1 = savitzkyGolay(X, 6, 5, 2, false); + +# compPlot(X_EMSC, X_EMSCi); # or do the below +plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false); +pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false); +plt = plot(plta, 
pltb, layout = (2,1)); +display(plt); +=# + + +""" + function plegendre(d, p) + +Calculates orthonormal Legendre polynomials using a QR factorisation. +Inputs: + - d : polynomial degree + - p : size of vector + +Outputs: + - Q : (d+1) x p matrix with basis + - R : matrix from QR-factorisation +""" + +function plegendre(d, p) + +P = ones(p, d+1); +x = (-1:2/(p-1):1)'; +for k in 1:d + P[:,k+1] = x.^k; +end + +factorisation = qr(P); +Q = Matrix(factorisation.Q)'; +R = Matrix(factorisation.R); + +return Q, R +end + + + +""" + function SNV(X) + +Standard Normal Variate (SNV) preprocessing (for each sample: subtract mean, divide by standard deviation) +Returns dictionary with keys X\\_Cor, means, stds. +""" +function SNV(X) + +X_SNV = zeros(size(X)); +means = mean(X, dims=2); +stds = std(X, dims=2); +X_SNV = @. (X - means) / stds; + +return_values = Dict([("X_Cor", X_SNV), ("means", means), ("stds", stds)]); +return return_values +end + + + + + + +""" + function MSC(X, ref::String="mean"||Vector{Float64}) + +MSC preprocessing (subtract constant trend, scale based on projection onto ref. spectrum) +Second argument is mean (default) or svd (first right singular vector), or a vector to be used as reference spectrum +Returns dictionary with keys X\\_Cor, X\\_Ref, coeffs +""" +function MSC(X, ref::String="mean") + +if ref == "mean" + X_Ref = vec(mean(X, dims=1)); +elseif ref == "svd" + u, s, v = svd(X, full=false); + X_Ref = v[:,1]; + if X_Ref[1] < 0 # Ad-hoc, attempts to correct for "flipped" spectra + X_Ref = -X_Ref; + end +end + +B = [ones(size(X,2), 1) X_Ref]; +coeffs = B \ X'; +X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:]; + +return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)]) +return return_values +end + + + +function MSC(X, X_Ref::Vector{Float64}) + +B = [ones(size(X,2), 1) X_Ref]; +coeffs = B \ X'; +X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:]; + +return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)]) +return return_values +end + + + + +""" + function EMSCTraditional(X, polDeg=2) + +EMSC correction with mean spectrum as reference and polynomial trends of form LinRange(-1,1,p).^i. +First argument is spectra to be corrected, second argument is either degree for polynomial regression +or a basis to be used for correction (basis used should be output of this function, the reference +spectrum is assumed to be the first basis vector). +Returns dictionary with keys X\\_Cor, basis, coeffs +""" +function EMSCTraditional(X, polDeg::Int64=2) + +n, p = size(X); +P = zeros(p, polDeg+1); +[P[:,i] = LinRange(-1, 1, p).^(i-1) for i in 1:polDeg+1]; +refSpec = mean(X, dims=1)'; +basis = [refSpec P]; + +return_values = EMSCTraditional(X, basis); +return return_values +end + + +""" + function EMSCTraditionalCorrection(X, basis) + +EMSC correction according to the given input basis that should be +the output of the EMSCTraditional function. +""" + +function EMSCTraditional(X, basis::Matrix{Float64}) + +coeffs = basis \ X'; +X_Cor = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:]; + +return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]); +return return_values +end + + + + + + +""" + function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false) + +SavitzkyGolay filter. +Inputs: + X - data matrix + d - polynomial order + w - window size parameter (window size = 2*w+1) + der_order - derivative order + keep_endpoints - boolean, truncates spectra if false. 
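+A minimal usage sketch, mirroring the call in the usage comment at the top of this file
+(assuming 'X' holds spectra in rows):
+
+    X_SG = savitzkyGolay(X, 6, 5, 2, false);   # 2nd derivative, degree-6 fit, window of 2*5+1 = 11 points
+    D2 = X_SG["X_Cor"];                        # endpoint columns are trimmed since keep_endpoints=false
+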
NOTE: For derivatives MUST USE FALSE + +Output: Dictionary with keys X\\_Cor, der\\_order, degree, window\\_size, filter\\_coeffs, keep\\_endpoints +""" +function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false) + +if (d > 2*w+1) | (der_order > d) + error("Error! Must have d>2*w+1 and der_order > d") +end + +n, p = size(X); +Q, R = plegendre(d, 2*w+1); +a = factorial(der_order) * (R \ Q)[der_order + 1,:]; +X_Cor = zeros(n, p); +X_Cor[:, 1:w] = X[:, 1:w]; +X_Cor[:, end-w:end] = X[:, end-w:end]; + +for i in 1:n + for j in 1:p-2*w-1 + X_Cor[i,j+w] = X[i, j : j + 2*w]'a; + end +end + +if !keep_endpoints + X_Cor = X_Cor[:, w+1:end]; + X_Cor = X_Cor[:, 1:end-w-1]; +end + +return Dict([("X_Cor", X_Cor), ("der_order", der_order), ("degree", d), ("window_size", w), ("filter_coeffs", a), ("keep_endpoints", keep_endpoints)]); +end + + + + + + + + + + + +""" + function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2) + function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2) + +Finding and correcting baseline using iterative polynomial fits +Implementation of baseline correcting algorithm in Automated Method for +Subtraction of Fluorescence from Biological Raman Spectra (Lieber and +Mahadevan-Jansen) as described in Optimal choice of baseline correction +for multivariate calibration of spectra (Liland and Mevik). +NOTE: Changed criterion to while loop to relative error and lowered threshold considerably compared to MATLAB code. +The first function finds the baseline, the second functions calls the first function repeatedly on all rows in X. +""" +function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2) +p = length(v); +P, _ = plegendre(polDeg, p); +prevCoeffs = zeros(polDeg+1); +newCoeffs = ones(polDeg+1); +vcur = v; +vnew = copy(v); +nIter = 0 + +while (norm(P' * (prevCoeffs-newCoeffs))/norm(P'*prevCoeffs) > thresh) && (nIter < maxIter) + +prevCoeffs = newCoeffs; +newCoeffs = P' \ vnew; +vnew = P' * newCoeffs; + +for i in 1:p + if vcur[i] < vnew[i] + vnew[i] = vcur[i]; + end +end + +nIter = nIter + 1; +end + +if nIter == maxIter + println("Warning! Reached maximum number of iterations!"); +end + +baseline = vec(newCoeffs' * P); +coeffs = newCoeffs'; +v_cor = v - baseline; + +return v_cor, baseline, coeffs +end + + + +function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2) + +n, p = size(X); +X_Cor = zeros(n, p); +baseline = zeros(n,p); +coeffs = zeros(n, polDeg+1); + +for i=1:n + _, baseline[i,:], coeffs[i,:] = findBaseline(X[i,:], polDeg, maxIter, thresh); + X_Cor[i,:] = X[i,:] - baseline[i,:]; +end + +return_values = Dict([("baseline", baseline), ("X_Cor", X_Cor)]); +return return_values +end + + + + + + + + + + +""" + function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0) + +Implementation of EMSC pre-processing based on Skogholt et al. (2018). +intF is 0 (no interferent), vector, or matrix with intereferents as rows. +Returns dictionary with keys X\\_Cor, model, coeffs +""" +function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0) + +if intF == 0 + nIntF = 0; +elseif typeof(intF)==Vector{Float64} + intF = reshape(intF, 1, length(intF)); + nIntF = 1; +else + nIntF = size(intF, 1); +end + +n, p = size(X); +P, _ = plegendre(polDeg, p); +refSpec = zeros(nRef, p); +tot = polDeg + 1 + nIntF; + +if refType == "svd" + _, _, v = svd(X, full=false); + refSpec = v[:,1:nRef]; +elseif refType == "mean" + refSpec = mean(X, dims=1)'; +end + +# Correcting sign if ref. vectors are flipped. 
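+# (The sign of an SVD component is arbitrary: a reference component that correlates negatively with
+# more than half of the spectra is negated, so that corrected spectra keep the conventional orientation.)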
+# Ad-hoc, but appears to work well in practice.
+correlations = cor(X', refSpec);
+for i in 1:nRef
+    if sum(correlations[:,i] .< 0) > n/2
+        refSpec[i,:] = -refSpec[i,:];
+    end
+end
+
+# Creating the basis; the reference spectra are placed at the end so that interferents
+# and polynomial trends can be corrected for.
+if nIntF == 0
+    basis, R = qr([P' refSpec]);
+    basis = Matrix(basis)';
+else
+    basis, R = qr([P' intF' refSpec]);
+    basis = Matrix(basis)';
+end
+
+# Checking if the QR factorisation "flipped" any of the vectors and, if so, flipping them back.
+for i in (tot+1):(tot+nRef)
+    if R[i,i] < 0
+        R[i,:] = -R[i,:];
+        basis[i,:] = -basis[i,:];
+    end
+end
+
+refPolPol = vec(R[1:tot, tot+1]' * basis[1:tot,:] / R[tot+1, tot+1]);
+
+# Optional baseline correction. The baseline is calculated on the (first) reference spectrum and the same
+# baseline is subtracted from all spectra (so that predictions are not affected).
+if baseDeg > -1
+    _, baseline = findBaseline(refSpec[:,1], baseDeg, 1000, 1e-4);
+else
+    baseline = zeros(p);
+end
+
+model = EMSCModel(basis, R, Vector([polDeg+1, nIntF, nRef]), refPolPol, baseline);
+return_values = EMSCCorrection(X, model);
+
+return return_values
+end
+
+
+
+"""
+    EMSCCorrection(X, model::EMSCModel)
+
+Performs EMSC correction on X with the given model.
+Returns dictionary with keys X\\_Cor, model, coeffs
+"""
+function EMSCCorrection(X, model::EMSCModel)
+
+n, _ = size(X);
+
+# Calculate coefficients (utilising the orthogonal basis),
+# subtract the projection onto the vectors modelling unwanted additive effects,
+# and normalise the L2-norm of each spectrum in the subspace spanned by the reference spectra.
+tot = sum(model.basisExplanation[1:2]);
+coeffs = X * model.basis';
+mult = sqrt.(sum(coeffs[:, tot+1:end].^2, dims=2));
+X_Cor = X - coeffs[:,1:tot] * model.basis[1:tot,:];
+X_Cor = X_Cor ./ mult;
+
+# The rest is purely for visualisation. The spectra look strange without polynomial trends,
+# so the polynomial trends of the reference spectrum are added back. This uses the fact that
+# R[1:tot, tot+1] contains the projection of the (first) reference spectrum onto the previous
+# basis vectors. As the same vector is added to every spectrum, predictions are unaffected.
+[X_Cor[i,:] = X_Cor[i,:] - model.refPolPol for i in 1:n];
+[X_Cor[i,:] = X_Cor[i,:] - model.baseline for i in 1:n]; # Surely this should be possible with broadcasting instead...
+# Thought it would be + baseline above, but this works...
+
+return_values = Dict([("X_Cor", X_Cor), ("model", model), ("coeffs", coeffs)]);
+return return_values
+end
\ No newline at end of file
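Note: `EMSC` and `EMSCCorrection` construct and consume an `EMSCModel`, but no definition of that type is included in this patch, and it is not exported from MinPakke.jl. A minimal sketch of what such a definition might look like (the field names are inferred from how `model` is used above; this is an assumption, not part of the original code):

    # Hypothetical container for a fitted EMSC model (not defined anywhere in this patch).
    struct EMSCModel
        basis               # orthonormal basis; reference spectra occupy the last nRef rows
        R                   # R factor from the QR factorisation of [P' intF' refSpec]
        basisExplanation    # [polDeg + 1, nIntF, nRef]
        refPolPol           # polynomial part of the (first) reference spectrum, re-added for visualisation
        baseline            # baseline estimated on the reference spectrum (zeros(p) if baseDeg == -1)
    end

With a definition along these lines included before preprocessing.jl, and with `using Random` added to MinPakke.jl plus a matching [deps] entry (convenience.jl calls `Random.seed!` and `randperm`), the usage sketches in the `#= =#` blocks at the top of convenience.jl and preprocessing.jl should run as written.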