Initial Commit

This commit is contained in:
Joakim Skogholt 2023-05-10 10:18:58 +02:00
parent 2387599b24
commit 974b59dcc3
5 changed files with 876 additions and 2 deletions

View file

@ -3,6 +3,16 @@ uuid = "be803360-cecc-4859-8120-03d0223bb960"
authors = ["Joakim"] authors = ["Joakim"]
version = "1.0.0-DEV" version = "1.0.0-DEV"
[deps]
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationBBO = "3e6eede4-6085-4f62-9a71-46d9bc1eb92b"
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
[compat] [compat]
julia = "1" julia = "1"

View file

@ -1,5 +1,30 @@
module MinPakke module MinPakke
# Write your package code here. using LinearAlgebra
using Statistics
using Plots
using MAT
using Optimization
using OptimizationOptimJL
using OptimizationBBO
using LaTeXStrings
end export plegendre
export SNV
export MSC
export EMSCTraditional
export savitzkyGolay
export findBaseline
export EMSC
export EMSCCorrection
export plotr
export compPlot
include("preprocessing.jl")
include("convenience.jl")
include("conveniencePlots.jl")
end

303
src/convenience.jl Normal file
View file

@ -0,0 +1,303 @@
#=
X, Y = importData("spectra");
splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42);
split = createDataSplit(splits, nTrainValTest, 5, X, Y);
size(split["XTrain"])
###
X, G = importData("OVALung");
splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
XVal = my_split["XVal"];
=#
"""
createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
Creates training/validation/test split for dataset
### Arguments
- 'X' : Data matrix or integer indicating number of samples
- 'nSplits' : Int, number of data splits to create
- 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1
- 'rngseed' : Int, for reproducibility
### Returns
- 'splits' : size(X,1) x nSplits matrix with indices
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
"""
function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
n = size(X, 1);
nTrainValTest = zeros(3);
splits = convert(Matrix{Int64}, zeros(n, nSplits));
if abs(sum(props) - 1) > 1e-14
error("Error! Proportions do not add up to 1");
end
nTrain = convert(Int64, floor(n * props[1]));
nVal = convert(Int64, floor(n * props[2]));
nTest = convert(Int64, floor(n * props[3]));
nLeft = n - (nTrain + nVal + nTest);
nTrain = nTrain + nLeft
Random.seed!(rngseed);
for i in 1:nSplits
inds = randperm(n);
splits[1:nTrain , i] = inds[1:nTrain];
splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal];
splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end];
end
nTrainValTest = [nTrain, nVal, nTest];
return splits, nTrainValTest
end
"""
createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
Creates training/validation/test split for dataset
### Arguments
- 'X' : Int, number of samples
- 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1
- 'nSplits' : Int, number of data splits
- 'rngseed' : Int, for reproducibility
### Returns
- 'splits' : size(X,1) x nSplits matrix with indices
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
"""
function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
n = X;
nTrainValTest = zeros(3);
splits = convert(Matrix{Int64}, zeros(n, nSplits));
if abs(sum(props) - 1) > 1e-14
error("Error! Proportions do not add up to 1");
end
nTrain = convert(Int64, floor(n * props[1]));
nVal = convert(Int64, floor(n * props[2]));
nTest = convert(Int64, floor(n * props[3]));
nLeft = n - (nTrain + nVal + nTest);
nTrain = nTrain + nLeft
Random.seed!(rngseed);
for i in 1:nSplits
inds = randperm(n);
splits[1:nTrain , i] = inds[1:nTrain];
splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal];
splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end];
end
nTrainValTest = [nTrain, nVal, nTest];
return splits, nTrainValTest
end
"""
createDataSplit(splits, nTrainValTest, splitInd, X, Y)
Creates training/validation/test split for dataset
### Arguments
- 'splits' : size(X,1) x nSplits matrix with indices
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
- 'splitInd' : Index of split to use
- 'X' and 'Y' : Data matrix and response vector/matrix
### Returns
- 'splits' : size(X,1) x nSplits matrix with indices
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
"""
function createDataSplit(splits, nTrainValTest, splitInd, X, Y)
XTrain = X[splits[1 : nTrainValTest[1], splitInd], :];
XVal = X[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :];
XTest = X[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :];
YTrain = Y[splits[1 : nTrainValTest[1], splitInd], :];
YVal = Y[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :];
YTest = Y[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :];
return Dict([("XTrain", XTrain), ("XVal", XVal), ("XTest", XTest), ("YTrain", YTrain), ("YVal", YVal), ("YTest", YTest)]);
end
"""
createDataSplitBinaryStratified(G::Vector{Int64}, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
Creates stratified training/validation/test split for dataset. Assumes G is coded as 1, 2
### Arguments
- 'G' : Vector{Int64}, group membership coded as 1, 2
- 'nSplits' : Int, number of data splits
- 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1
- 'rngseed' : Int, for reproducibility
### Returns
- 'splits' : size(X,1) x nSplits matrix with indices
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
"""
function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
n = length(G);
splits = convert(Matrix{Int64}, zeros(n, nSplits));
indsG1 = findall( @. abs(G - 1.0) < 1e-14)
indsG2 = findall( @. abs(G - 2.0) < 1e-14)
nG1 = length(indsG1);
nG2 = length(indsG2);
propG1 = nG1 / n
propG2 = 1 - propG1;
nTrain1 = convert(Int64, floor(n * props[1] * propG1));
nVal1 = convert(Int64, floor(n * props[2] * propG1));
nTest1 = convert(Int64, floor(n * props[3] * propG1));
nTrain2 = convert(Int64, floor(n * props[1] * propG2));
nVal2 = convert(Int64, floor(n * props[2] * propG2));
nTest2 = convert(Int64, floor(n * props[3] * propG2));
# Unassigned samples (if any) are added to the training set)
nLeftG1 = nG1 - (nTrain1 + nVal1 + nTest1);
nLeftG2 = nG2 - (nTrain2 + nVal2 + nTest2);
nTrain1 = nTrain1 + nLeftG1;
nTrain2 = nTrain2 + nLeftG2;
nTrain = nTrain1 + nTrain2;
nVal = nVal1 + nVal2;
nTest = nTest1 + nTest2;
if (nTrain1+nTrain2+nVal1+nVal2+nTest1+nTest2) != n
error("Error! Some samples are not assigned!");
end
Random.seed!(rngseed);
for i in 1:nSplits
G1IndsShuffle = randperm(nG1);
G2IndsShuffle = randperm(nG2);
splits[1 : nTrain , i] = [indsG1[G1IndsShuffle[1:nTrain1]]; indsG2[G2IndsShuffle[1:nTrain2]]];
splits[nTrain+1 : nTrain + nVal, i] = [indsG1[G1IndsShuffle[nTrain1 + 1 : nTrain1 + nVal1]]; indsG2[G2IndsShuffle[nTrain2 + 1 : nTrain2 + nVal2]]];
splits[nTrain+nVal+1 : end , i] = [indsG1[G1IndsShuffle[nTrain1 + nVal1 + 1 : end]]; indsG2[G2IndsShuffle[nTrain2 + nVal2 + 1 : end]]]; # 60
end
nTrainValTest = [nTrain, nVal, nTest];
return splits, nTrainValTest
end
"""
importData(datasetName, datasetPath="/home/jovyan/Datasets/")
Example: X, y, waves = importData("Beer");
Returns X, Y/G, [waves], +more for some datasets
Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb,
leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original,
Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11
"""
function importData(datasetName, datasetPath="/home/jovyan/Datasets/")
#datasetName = "Beer"
#datasetPath = "/home/jovyan/Datasets/"
vars = matread(string(datasetPath,datasetName,".mat"));
if datasetName == "9tumors"
X = vars["X"];
G = vars["G"];
elseif datasetName == "bacteria_K4_K5_K6"
# Causes crash for some reason
elseif datasetName == "Beer"
X = vars["X"];
Y = vars["Y"]; # G for some datasets
waves = 400:2:2250;
return X, Y, waves
elseif datasetName == "Dough"
X = [vars["Xtrain"]; vars["Xtest"]];
Y = [vars["Ytrain"]; vars["Ytest"]];
#if response != "all"
# Y = Y[:,response];
# Y = dropdims(Y;dims=2);
#end
return X, Y
elseif datasetName == "FTIR_AMW_tidied"
X = vars["X"];
Y = vars["Y"];
elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP"
elseif datasetName == "HPLCforweb"
X = vars["HPLCforweb"]["data"]
elseif datasetName == "leukemia1"
X = vars["X"];
G = vars["G"];
elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact"
elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW!
X = vars["Ramandata"];
Y = vars["PUFAdata"];
waves = vars["Wavelengths_Raman"];
return X, Y, waves
elseif datasetName == "NMR_WineData"
X = vars["X"]
Y = vars["Y"];
ppm = vars["ppm"];
return X, Y, ppm
elseif datasetName == "onion_NMR"
X = vars["x"]
Y = vars["onion"];
ppm = vars["ppm"];
return X, Y, ppm
elseif datasetName == "OVALung"
X = vars["X"];
G = vars["y"];
return X, G
elseif datasetName == "Raman_Adipose_fat_pig_original"
X = vars["spectra"]
Y = vars["fat"]
baseline = vars["baseline"]
waves = parse.(Float64,vars["wavelength"])
return X, Y, waves, baseline
elseif datasetName == "Ramanmilk"
# There are replicates that need to be handled
#X = vars["SpectraR"]
#Y = vars["CLA"]
elseif datasetName == "RAMANPorkFat"
X = vars["X"]["data"]
Y = vars["Y"]["data"]
return X, Y
elseif datasetName == "spectra"
X = vars["NIR"];
Y = vars["octane"];
waves = 900:2:1700;
return X, Y, waves
elseif datasetName == "Sugar"
X = [vars["Xtrain"]; vars["Xtest"]]; Y = [vars["Ytrain"]; vars["Ytest"]];
waves = vars["wave"];
return X, Y, waves
elseif datasetName == "tumor11"
X = vars["X"];
G = vars["G"];
end
end

148
src/conveniencePlots.jl Normal file
View file

@ -0,0 +1,148 @@
### TO-DO: Add support for arbitrary number of plots
#=
X, _, waves = importData("Raman_Adipose_fat_plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false);
pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);ig_original");
X_SNV = SNV(X);
X_EMSC = EMSC(X,6);
plotr(waves, X, X_EMSC);
plotr(waves, X);
=#
function plotr(waves, X::Matrix{Float64})
plt = plot(waves, X', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
display(plt);
#return plt;
end
function plotr(waves, X1::Matrix{Float64}, X2::Matrix{Float64})
plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function plotr(waves, X)
plt = plot(waves, X["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
display(plt);
#return plt;
end
function plotr(waves, X1, X2)
plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function plotr(waves, X1::Matrix{Float64}, X2)
plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function plotr(waves, X1, X2::Matrix{Float64})
plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function compPlot(X1::Matrix{Float64}, X2::Matrix{Float64})
plta = plot(X1', legend=false, grid=false);
pltb = plot(X2', legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function compPlot(X1::Vector{Float64}, X2::Vector{Float64})
plta = plot(X1, legend=false, grid=false);
pltb = plot(X2, legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function compPlot(X1, X2::Matrix{Float64})
plta = plot(X1["X_Cor"]', legend=false, grid=false);
pltb = plot(X2', legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function compPlot(X1::Matrix{Float64}, X2)
plta = plot(X1', legend=false, grid=false);
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function compPlot(X1, X2)
plta = plot(X1["X_Cor"]', legend=false, grid=false);
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
#return plt;
end
function compPlot(X1, X2, X_3)
plta = plot(X1["X_Cor"]', legend=false, grid=false);
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
pltc = plot(X3["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, pltc, layout = (3,1));
display(plt);
#return plt;
end
function compPlot(X1::Matrix{Float64}, X2, X3)
plta = plot(X1', legend=false, grid=false);
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
pltc = plot(X3["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, pltc, layout = (3,1));
display(plt);
#return plt;
end
function compPlot(X1, X2::Matrix{Float64}, X3)
plta = plot(X1["X_Cor"]', legend=false, grid=false);
pltb = plot(X2', legend=false, grid=false);
pltc = plot(X3["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, pltc, layout = (3,1));
display(plt);
#return plt;
end
function compPlot(X1, X2, X3::Matrix{Float64})
plta = plot(X1["X_Cor"]', legend=false, grid=false);
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
pltc = plot(X3', legend=false, grid=false);
plt = plot(plta, pltb, pltc, layout = (3,1));
display(plt);
#return plt;
end

388
src/preprocessing.jl Normal file
View file

@ -0,0 +1,388 @@
#=
X, y = importData("Raman_Adipose_fat_pig_original"); # RAMANPorkFat
X_MSC = MSC(X);
X_MSCm = MSC(X, vec(mean(X, dims=1)));
X_SNV = SNV(X);
X_EMSC = EMSC(X, 6);
X_EMSCi = EMSC(X, 6, "svd", 1, -1, X[71,:]-X[39,:]); # Only for Raman_Adipose_fat_pig_original
X_EMSCb = EMSC(X, 6, "svd", 6, 0);
X_EMSCt = EMSCTraditional(X, 6);
X_base = findBaseline(X, 6);
X_SV1 = savitzkyGolay(X, 6, 5, 2, false);
# compPlot(X_EMSC, X_EMSCi); # or do the below
plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false);
pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false);
plt = plot(plta, pltb, layout = (2,1));
display(plt);
=#
"""
function plegendre(d, p)
Calculates orthonormal Legendre polynomials using a QR factorisation.
Inputs:
- d : polynomial degree
- p : size of vector
Outputs:
- Q : (d+1) x p matrix with basis
- R : matrix from QR-factorisation
"""
function plegendre(d, p)
P = ones(p, d+1);
x = (-1:2/(p-1):1)';
for k in 1:d
P[:,k+1] = x.^k;
end
factorisation = qr(P);
Q = Matrix(factorisation.Q)';
R = Matrix(factorisation.R);
return Q, R
end
"""
function SNV(X)
Standard Normal Variate (SNV) preprocessing (for each sample: subtract mean, divide by standard deviation)
Returns dictionary with keys X\\_Cor, means, stds.
"""
function SNV(X)
X_SNV = zeros(size(X));
means = mean(X, dims=2);
stds = std(X, dims=2);
X_SNV = @. (X - means) / stds;
return_values = Dict([("X_Cor", X_SNV), ("means", means), ("stds", stds)]);
return return_values
end
"""
function MSC(X, ref::String="mean"||Vector{Float64})
MSC preprocessing (subtract constant trend, scale based on projection onto ref. spectrum)
Second argument is mean (default) or svd (first right singular vector), or a vector to be used as reference spectrum
Returns dictionary with keys X\\_Cor, X\\_Ref, coeffs
"""
function MSC(X, ref::String="mean")
if ref == "mean"
X_Ref = vec(mean(X, dims=1));
elseif ref == "svd"
u, s, v = svd(X, full=false);
X_Ref = v[:,1];
if X_Ref[1] < 0 # Ad-hoc, attempts to correct for "flipped" spectra
X_Ref = -X_Ref;
end
end
B = [ones(size(X,2), 1) X_Ref];
coeffs = B \ X';
X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:];
return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)])
return return_values
end
function MSC(X, X_Ref::Vector{Float64})
B = [ones(size(X,2), 1) X_Ref];
coeffs = B \ X';
X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:];
return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)])
return return_values
end
"""
function EMSCTraditional(X, polDeg=2)
EMSC correction with mean spectrum as reference and polynomial trends of form LinRange(-1,1,p).^i.
First argument is spectra to be corrected, second argument is either degree for polynomial regression
or a basis to be used for correction (basis used should be output of this function, the reference
spectrum is assumed to be the first basis vector).
Returns dictionary with keys X\\_Cor, basis, coeffs
"""
function EMSCTraditional(X, polDeg::Int64=2)
n, p = size(X);
P = zeros(p, polDeg+1);
[P[:,i] = LinRange(-1, 1, p).^(i-1) for i in 1:polDeg+1];
refSpec = mean(X, dims=1)';
basis = [refSpec P];
return_values = EMSCTraditional(X, basis);
return return_values
end
"""
function EMSCTraditionalCorrection(X, basis)
EMSC correction according to the given input basis that should be
the output of the EMSCTraditional function.
"""
function EMSCTraditional(X, basis::Matrix{Float64})
coeffs = basis \ X';
X_Cor = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:];
return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]);
return return_values
end
"""
function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false)
SavitzkyGolay filter.
Inputs:
X - data matrix
d - polynomial order
w - window size parameter (window size = 2*w+1)
der_order - derivative order
keep_endpoints - boolean, truncates spectra if false. NOTE: For derivatives MUST USE FALSE
Output: Dictionary with keys X\\_Cor, der\\_order, degree, window\\_size, filter\\_coeffs, keep\\_endpoints
"""
function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false)
if (d > 2*w+1) | (der_order > d)
error("Error! Must have d>2*w+1 and der_order > d")
end
n, p = size(X);
Q, R = plegendre(d, 2*w+1);
a = factorial(der_order) * (R \ Q)[der_order + 1,:];
X_Cor = zeros(n, p);
X_Cor[:, 1:w] = X[:, 1:w];
X_Cor[:, end-w:end] = X[:, end-w:end];
for i in 1:n
for j in 1:p-2*w-1
X_Cor[i,j+w] = X[i, j : j + 2*w]'a;
end
end
if !keep_endpoints
X_Cor = X_Cor[:, w+1:end];
X_Cor = X_Cor[:, 1:end-w-1];
end
return Dict([("X_Cor", X_Cor), ("der_order", der_order), ("degree", d), ("window_size", w), ("filter_coeffs", a), ("keep_endpoints", keep_endpoints)]);
end
"""
function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
Finding and correcting baseline using iterative polynomial fits
Implementation of baseline correcting algorithm in Automated Method for
Subtraction of Fluorescence from Biological Raman Spectra (Lieber and
Mahadevan-Jansen) as described in Optimal choice of baseline correction
for multivariate calibration of spectra (Liland and Mevik).
NOTE: Changed criterion to while loop to relative error and lowered threshold considerably compared to MATLAB code.
The first function finds the baseline, the second functions calls the first function repeatedly on all rows in X.
"""
function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
p = length(v);
P, _ = plegendre(polDeg, p);
prevCoeffs = zeros(polDeg+1);
newCoeffs = ones(polDeg+1);
vcur = v;
vnew = copy(v);
nIter = 0
while (norm(P' * (prevCoeffs-newCoeffs))/norm(P'*prevCoeffs) > thresh) && (nIter < maxIter)
prevCoeffs = newCoeffs;
newCoeffs = P' \ vnew;
vnew = P' * newCoeffs;
for i in 1:p
if vcur[i] < vnew[i]
vnew[i] = vcur[i];
end
end
nIter = nIter + 1;
end
if nIter == maxIter
println("Warning! Reached maximum number of iterations!");
end
baseline = vec(newCoeffs' * P);
coeffs = newCoeffs';
v_cor = v - baseline;
return v_cor, baseline, coeffs
end
function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
n, p = size(X);
X_Cor = zeros(n, p);
baseline = zeros(n,p);
coeffs = zeros(n, polDeg+1);
for i=1:n
_, baseline[i,:], coeffs[i,:] = findBaseline(X[i,:], polDeg, maxIter, thresh);
X_Cor[i,:] = X[i,:] - baseline[i,:];
end
return_values = Dict([("baseline", baseline), ("X_Cor", X_Cor)]);
return return_values
end
"""
function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0)
Implementation of EMSC pre-processing based on Skogholt et al. (2018).
intF is 0 (no interferent), vector, or matrix with intereferents as rows.
Returns dictionary with keys X\\_Cor, model, coeffs
"""
function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0)
if intF == 0
nIntF = 0;
elseif typeof(intF)==Vector{Float64}
intF = reshape(intF, 1, length(intF));
nIntF = 1;
else
nIntF = size(intF, 1);
end
n, p = size(X);
P, _ = plegendre(polDeg, p);
refSpec = zeros(nRef, p);
tot = polDeg + 1 + nIntF;
if refType == "svd"
_, _, v = svd(X, full=false);
refSpec = v[:,1:nRef];
elseif refType == "mean"
refSpec = mean(X, dims=1)';
end
# Correcting sign if ref. vectors are flipped. Ad-hoc but appears to work
# well in practice.
correlations = cor(X', refSpec);
for i in 1:nRef
if sum(correlations[:,i] .< 0) > n/2
refSpec[i,:] = -refSpec[i,:];
end
end
# Creating basis, need ref. spectra at the end to correct for interferents
# and polynoial trends.
if nIntF == 0
basis, R = qr([P' refSpec]);
basis = Matrix(basis)';
else
basis, R = qr([P' intF' refSpec]);
basis = Matrix(basis)';
end
# Checking if QR factorisation "flipped" any of the vectors and if so flip them back.
for i in (tot+1):(tot+nRef)
if R[i,i] < 0
R[i,:] = -R[i,:];
basis[i,:] = -basis[i,:];
end
end
refPolPol = vec( R[1:tot, tot+1]' * basis[1:tot,:] / R[tot+1, tot+1]);
# Optional baseline correction. Baseline calculated on (first) reference spectrum and same baseline is subtracted
# from all spectra (to not affect prediction).
if baseDeg > -1
_, baseline = findBaseline(refSpec[:,1], baseDeg, 1000, 1e-4);
else
baseline = zeros(p);
end
model = EMSCModel(basis, R, Vector([polDeg+1, nIntF, nRef]), refPolPol, baseline);
return_values = EMSCCorrection(X, model);
return return_values
end
"""
- EMSCCorrection(X, model::EMSCModel)
Performs EMSC correction on X with given model.
Returns dictionary with keys X\\_Cor, model, coeffs
"""
function EMSCCorrection(X, model::EMSCModel)
n, _ = size(X);
# Calculate coefficients (utilizing orthogonl basis),
# Subtract projection onto vectors modelling unwanted additive stuff,
# Normalise L2-norm of spectra in subspace spanned by reference spectrum
tot = sum(model.basisExplanation[1:2]);
coeffs = X * model.basis';
mult = sqrt.(sum(coeffs[:, tot+1:end].^2, dims=2));
X_Cor = X - coeffs[:,1:tot] * model.basis[1:tot,:];
X_Cor = X_Cor ./ mult;
# Below is purely for visualisation.
# The spectra look strange without polynomial trends,
# so what is done below is adding back the polynomial trends to the reference spectrum.
# This is done by using the fact that R[1:tot, tot+1] contains the projection of the (first)
# reference spectrum onto the previous basis vectors. This allows us to easily add back the polynomial trends.
# As the same vector is added to each spectrum predictions ar eunaffected.
[X_Cor[i,:] = X_Cor[i,:] - model.refPolPol for i in 1:n];
[X_Cor[i,:] = X_Cor[i,:] - model.baseline for i in 1:n]; # Surely this should be possible with broadcasting instead...
# Thought it would be + baseline above, but this works...
return_values = Dict([("X_Cor", X_Cor), ("model", model), ("coeffs", coeffs)]);
return return_values
end