Initial Commit
This commit is contained in:
parent
2387599b24
commit
974b59dcc3
5 changed files with 876 additions and 2 deletions
10
Project.toml
10
Project.toml
|
|
@ -3,6 +3,16 @@ uuid = "be803360-cecc-4859-8120-03d0223bb960"
|
||||||
authors = ["Joakim"]
|
authors = ["Joakim"]
|
||||||
version = "1.0.0-DEV"
|
version = "1.0.0-DEV"
|
||||||
|
|
||||||
|
[deps]
|
||||||
|
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
|
||||||
|
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||||
|
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
|
||||||
|
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
|
||||||
|
OptimizationBBO = "3e6eede4-6085-4f62-9a71-46d9bc1eb92b"
|
||||||
|
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
|
||||||
|
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
|
||||||
|
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
||||||
|
|
||||||
[compat]
|
[compat]
|
||||||
julia = "1"
|
julia = "1"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,30 @@
|
||||||
module MinPakke
|
module MinPakke
|
||||||
|
|
||||||
# Write your package code here.
|
using LinearAlgebra
|
||||||
|
using Statistics
|
||||||
|
using Plots
|
||||||
|
using MAT
|
||||||
|
using Optimization
|
||||||
|
using OptimizationOptimJL
|
||||||
|
using OptimizationBBO
|
||||||
|
using LaTeXStrings
|
||||||
|
|
||||||
|
export plegendre
|
||||||
|
export SNV
|
||||||
|
export MSC
|
||||||
|
export EMSCTraditional
|
||||||
|
export savitzkyGolay
|
||||||
|
export findBaseline
|
||||||
|
export EMSC
|
||||||
|
export EMSCCorrection
|
||||||
|
|
||||||
|
export plotr
|
||||||
|
export compPlot
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
include("preprocessing.jl")
|
||||||
|
include("convenience.jl")
|
||||||
|
include("conveniencePlots.jl")
|
||||||
|
|
||||||
end
|
end
|
||||||
303
src/convenience.jl
Normal file
303
src/convenience.jl
Normal file
|
|
@ -0,0 +1,303 @@
|
||||||
|
#=
|
||||||
|
X, Y = importData("spectra");
|
||||||
|
splits, nTrainValTest = createDataSplitInds(X, 10, [6/10, 2/10, 2/10], 42);
|
||||||
|
split = createDataSplit(splits, nTrainValTest, 5, X, Y);
|
||||||
|
size(split["XTrain"])
|
||||||
|
|
||||||
|
###
|
||||||
|
|
||||||
|
X, G = importData("OVALung");
|
||||||
|
splitStrat, nTVT = createDataSplitBinaryStratified(G, 5);
|
||||||
|
my_split = createDataSplit(splitStrat, nTVT, 2, X, G);
|
||||||
|
XVal = my_split["XVal"];
|
||||||
|
=#
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
|
||||||
|
|
||||||
|
Creates training/validation/test split for dataset
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
- 'X' : Data matrix or integer indicating number of samples
|
||||||
|
- 'nSplits' : Int, number of data splits to create
|
||||||
|
- 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1
|
||||||
|
- 'rngseed' : Int, for reproducibility
|
||||||
|
|
||||||
|
### Returns
|
||||||
|
- 'splits' : size(X,1) x nSplits matrix with indices
|
||||||
|
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
|
||||||
|
"""
|
||||||
|
function createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
|
||||||
|
n = size(X, 1);
|
||||||
|
nTrainValTest = zeros(3);
|
||||||
|
splits = convert(Matrix{Int64}, zeros(n, nSplits));
|
||||||
|
|
||||||
|
if abs(sum(props) - 1) > 1e-14
|
||||||
|
error("Error! Proportions do not add up to 1");
|
||||||
|
end
|
||||||
|
|
||||||
|
nTrain = convert(Int64, floor(n * props[1]));
|
||||||
|
nVal = convert(Int64, floor(n * props[2]));
|
||||||
|
nTest = convert(Int64, floor(n * props[3]));
|
||||||
|
nLeft = n - (nTrain + nVal + nTest);
|
||||||
|
nTrain = nTrain + nLeft
|
||||||
|
|
||||||
|
Random.seed!(rngseed);
|
||||||
|
|
||||||
|
for i in 1:nSplits
|
||||||
|
inds = randperm(n);
|
||||||
|
|
||||||
|
splits[1:nTrain , i] = inds[1:nTrain];
|
||||||
|
splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal];
|
||||||
|
splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end];
|
||||||
|
end
|
||||||
|
|
||||||
|
nTrainValTest = [nTrain, nVal, nTest];
|
||||||
|
|
||||||
|
return splits, nTrainValTest
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
createDataSplitInds(X, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
|
||||||
|
Creates training/validation/test split for dataset
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
- 'X' : Int, number of samples
|
||||||
|
- 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1
|
||||||
|
- 'nSplits' : Int, number of data splits
|
||||||
|
- 'rngseed' : Int, for reproducibility
|
||||||
|
|
||||||
|
### Returns
|
||||||
|
- 'splits' : size(X,1) x nSplits matrix with indices
|
||||||
|
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
|
||||||
|
"""
|
||||||
|
function createDataSplitInds(X::Int64, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
|
||||||
|
n = X;
|
||||||
|
nTrainValTest = zeros(3);
|
||||||
|
splits = convert(Matrix{Int64}, zeros(n, nSplits));
|
||||||
|
|
||||||
|
if abs(sum(props) - 1) > 1e-14
|
||||||
|
error("Error! Proportions do not add up to 1");
|
||||||
|
end
|
||||||
|
|
||||||
|
nTrain = convert(Int64, floor(n * props[1]));
|
||||||
|
nVal = convert(Int64, floor(n * props[2]));
|
||||||
|
nTest = convert(Int64, floor(n * props[3]));
|
||||||
|
nLeft = n - (nTrain + nVal + nTest);
|
||||||
|
nTrain = nTrain + nLeft
|
||||||
|
|
||||||
|
Random.seed!(rngseed);
|
||||||
|
|
||||||
|
for i in 1:nSplits
|
||||||
|
inds = randperm(n);
|
||||||
|
|
||||||
|
splits[1:nTrain , i] = inds[1:nTrain];
|
||||||
|
splits[nTrain+1 : nTrain + nVal, i] = inds[nTrain+1 : nTrain + nVal];
|
||||||
|
splits[nTrain+nVal+1:end , i] = inds[nTrain+nVal+1 : end];
|
||||||
|
end
|
||||||
|
|
||||||
|
nTrainValTest = [nTrain, nVal, nTest];
|
||||||
|
|
||||||
|
return splits, nTrainValTest
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
createDataSplit(splits, nTrainValTest, splitInd, X, Y)
|
||||||
|
|
||||||
|
Creates training/validation/test split for dataset
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
- 'splits' : size(X,1) x nSplits matrix with indices
|
||||||
|
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
|
||||||
|
- 'splitInd' : Index of split to use
|
||||||
|
- 'X' and 'Y' : Data matrix and response vector/matrix
|
||||||
|
|
||||||
|
### Returns
|
||||||
|
- 'splits' : size(X,1) x nSplits matrix with indices
|
||||||
|
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
|
||||||
|
"""
|
||||||
|
function createDataSplit(splits, nTrainValTest, splitInd, X, Y)
|
||||||
|
|
||||||
|
XTrain = X[splits[1 : nTrainValTest[1], splitInd], :];
|
||||||
|
XVal = X[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :];
|
||||||
|
XTest = X[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :];
|
||||||
|
|
||||||
|
YTrain = Y[splits[1 : nTrainValTest[1], splitInd], :];
|
||||||
|
YVal = Y[splits[nTrainValTest[1] + 1 : nTrainValTest[1] + nTrainValTest[2], splitInd], :];
|
||||||
|
YTest = Y[splits[nTrainValTest[1] + nTrainValTest[2]+1 : end, splitInd], :];
|
||||||
|
|
||||||
|
return Dict([("XTrain", XTrain), ("XVal", XVal), ("XTest", XTest), ("YTrain", YTrain), ("YVal", YVal), ("YTest", YTest)]);
|
||||||
|
end
|
||||||
|
|
||||||
|
"""
|
||||||
|
createDataSplitBinaryStratified(G::Vector{Int64}, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
|
||||||
|
Creates stratified training/validation/test split for dataset. Assumes G is coded as 1, 2
|
||||||
|
|
||||||
|
### Arguments
|
||||||
|
- 'G' : Vector{Int64}, group membership coded as 1, 2
|
||||||
|
- 'nSplits' : Int, number of data splits
|
||||||
|
- 'props' : 3x1 Vector of proportions (train/val/test), must sum to 1
|
||||||
|
- 'rngseed' : Int, for reproducibility
|
||||||
|
|
||||||
|
### Returns
|
||||||
|
- 'splits' : size(X,1) x nSplits matrix with indices
|
||||||
|
- 'nTrainValTest' : 3x1 vector indicating number of samples in training/validation/testing
|
||||||
|
"""
|
||||||
|
function createDataSplitBinaryStratified(G::Vector, nSplits, props=[6/10, 2/10, 2/10], rngseed=42)
|
||||||
|
|
||||||
|
n = length(G);
|
||||||
|
splits = convert(Matrix{Int64}, zeros(n, nSplits));
|
||||||
|
|
||||||
|
indsG1 = findall( @. abs(G - 1.0) < 1e-14)
|
||||||
|
indsG2 = findall( @. abs(G - 2.0) < 1e-14)
|
||||||
|
|
||||||
|
nG1 = length(indsG1);
|
||||||
|
nG2 = length(indsG2);
|
||||||
|
|
||||||
|
propG1 = nG1 / n
|
||||||
|
propG2 = 1 - propG1;
|
||||||
|
|
||||||
|
nTrain1 = convert(Int64, floor(n * props[1] * propG1));
|
||||||
|
nVal1 = convert(Int64, floor(n * props[2] * propG1));
|
||||||
|
nTest1 = convert(Int64, floor(n * props[3] * propG1));
|
||||||
|
nTrain2 = convert(Int64, floor(n * props[1] * propG2));
|
||||||
|
nVal2 = convert(Int64, floor(n * props[2] * propG2));
|
||||||
|
nTest2 = convert(Int64, floor(n * props[3] * propG2));
|
||||||
|
|
||||||
|
# Unassigned samples (if any) are added to the training set)
|
||||||
|
nLeftG1 = nG1 - (nTrain1 + nVal1 + nTest1);
|
||||||
|
nLeftG2 = nG2 - (nTrain2 + nVal2 + nTest2);
|
||||||
|
|
||||||
|
nTrain1 = nTrain1 + nLeftG1;
|
||||||
|
nTrain2 = nTrain2 + nLeftG2;
|
||||||
|
|
||||||
|
nTrain = nTrain1 + nTrain2;
|
||||||
|
nVal = nVal1 + nVal2;
|
||||||
|
nTest = nTest1 + nTest2;
|
||||||
|
|
||||||
|
if (nTrain1+nTrain2+nVal1+nVal2+nTest1+nTest2) != n
|
||||||
|
error("Error! Some samples are not assigned!");
|
||||||
|
end
|
||||||
|
|
||||||
|
Random.seed!(rngseed);
|
||||||
|
|
||||||
|
for i in 1:nSplits
|
||||||
|
G1IndsShuffle = randperm(nG1);
|
||||||
|
G2IndsShuffle = randperm(nG2);
|
||||||
|
|
||||||
|
splits[1 : nTrain , i] = [indsG1[G1IndsShuffle[1:nTrain1]]; indsG2[G2IndsShuffle[1:nTrain2]]];
|
||||||
|
splits[nTrain+1 : nTrain + nVal, i] = [indsG1[G1IndsShuffle[nTrain1 + 1 : nTrain1 + nVal1]]; indsG2[G2IndsShuffle[nTrain2 + 1 : nTrain2 + nVal2]]];
|
||||||
|
splits[nTrain+nVal+1 : end , i] = [indsG1[G1IndsShuffle[nTrain1 + nVal1 + 1 : end]]; indsG2[G2IndsShuffle[nTrain2 + nVal2 + 1 : end]]]; # 60
|
||||||
|
end
|
||||||
|
|
||||||
|
nTrainValTest = [nTrain, nVal, nTest];
|
||||||
|
|
||||||
|
return splits, nTrainValTest
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
importData(datasetName, datasetPath="/home/jovyan/Datasets/")
|
||||||
|
|
||||||
|
Example: X, y, waves = importData("Beer");
|
||||||
|
Returns X, Y/G, [waves], +more for some datasets
|
||||||
|
Valid values for datasetName: 9tumors, bacteria\\_K4\\_K5\\_K6, Beer, Dough, FTIR\\_AMW\\_tidied, FTIR\\_FTIR\\_FTIR\\_AFLP\\_SakacinP, HPLCforweb,
|
||||||
|
leukemia1, MALDI-TOF\\_melk\\_rep\\_6179\\_corr\\_exact, NIR\\_Raman\\_PUFA\\_[NIR/Raman], NMR\\_WineData, onion\\_NMR, OVALung, Raman\\_Adipose\\_fat\\_pig\\_original,
|
||||||
|
Ramanmilk, RAMANPorkFat, spectra, Sugar, tumor11
|
||||||
|
"""
|
||||||
|
function importData(datasetName, datasetPath="/home/jovyan/Datasets/")
|
||||||
|
|
||||||
|
#datasetName = "Beer"
|
||||||
|
#datasetPath = "/home/jovyan/Datasets/"
|
||||||
|
vars = matread(string(datasetPath,datasetName,".mat"));
|
||||||
|
|
||||||
|
if datasetName == "9tumors"
|
||||||
|
X = vars["X"];
|
||||||
|
G = vars["G"];
|
||||||
|
elseif datasetName == "bacteria_K4_K5_K6"
|
||||||
|
# Causes crash for some reason
|
||||||
|
elseif datasetName == "Beer"
|
||||||
|
X = vars["X"];
|
||||||
|
Y = vars["Y"]; # G for some datasets
|
||||||
|
waves = 400:2:2250;
|
||||||
|
return X, Y, waves
|
||||||
|
elseif datasetName == "Dough"
|
||||||
|
X = [vars["Xtrain"]; vars["Xtest"]];
|
||||||
|
Y = [vars["Ytrain"]; vars["Ytest"]];
|
||||||
|
#if response != "all"
|
||||||
|
# Y = Y[:,response];
|
||||||
|
# Y = dropdims(Y;dims=2);
|
||||||
|
#end
|
||||||
|
return X, Y
|
||||||
|
elseif datasetName == "FTIR_AMW_tidied"
|
||||||
|
X = vars["X"];
|
||||||
|
Y = vars["Y"];
|
||||||
|
elseif datasetName == "FTIR_FTIR_FTIR_AFLP_SakacinP"
|
||||||
|
|
||||||
|
elseif datasetName == "HPLCforweb"
|
||||||
|
X = vars["HPLCforweb"]["data"]
|
||||||
|
elseif datasetName == "leukemia1"
|
||||||
|
X = vars["X"];
|
||||||
|
G = vars["G"];
|
||||||
|
elseif datasetName == "MALDI-TOF_melk_rep_6179_corr_exact"
|
||||||
|
|
||||||
|
elseif datasetName == "NIR_Raman_PUFA" # ONLY RAMAN FOR NOW!
|
||||||
|
X = vars["Ramandata"];
|
||||||
|
Y = vars["PUFAdata"];
|
||||||
|
waves = vars["Wavelengths_Raman"];
|
||||||
|
return X, Y, waves
|
||||||
|
elseif datasetName == "NMR_WineData"
|
||||||
|
X = vars["X"]
|
||||||
|
Y = vars["Y"];
|
||||||
|
ppm = vars["ppm"];
|
||||||
|
return X, Y, ppm
|
||||||
|
elseif datasetName == "onion_NMR"
|
||||||
|
X = vars["x"]
|
||||||
|
Y = vars["onion"];
|
||||||
|
ppm = vars["ppm"];
|
||||||
|
return X, Y, ppm
|
||||||
|
elseif datasetName == "OVALung"
|
||||||
|
X = vars["X"];
|
||||||
|
G = vars["y"];
|
||||||
|
return X, G
|
||||||
|
elseif datasetName == "Raman_Adipose_fat_pig_original"
|
||||||
|
X = vars["spectra"]
|
||||||
|
Y = vars["fat"]
|
||||||
|
baseline = vars["baseline"]
|
||||||
|
waves = parse.(Float64,vars["wavelength"])
|
||||||
|
return X, Y, waves, baseline
|
||||||
|
elseif datasetName == "Ramanmilk"
|
||||||
|
# There are replicates that need to be handled
|
||||||
|
#X = vars["SpectraR"]
|
||||||
|
#Y = vars["CLA"]
|
||||||
|
elseif datasetName == "RAMANPorkFat"
|
||||||
|
X = vars["X"]["data"]
|
||||||
|
Y = vars["Y"]["data"]
|
||||||
|
return X, Y
|
||||||
|
elseif datasetName == "spectra"
|
||||||
|
X = vars["NIR"];
|
||||||
|
Y = vars["octane"];
|
||||||
|
waves = 900:2:1700;
|
||||||
|
return X, Y, waves
|
||||||
|
elseif datasetName == "Sugar"
|
||||||
|
X = [vars["Xtrain"]; vars["Xtest"]]; Y = [vars["Ytrain"]; vars["Ytest"]];
|
||||||
|
waves = vars["wave"];
|
||||||
|
return X, Y, waves
|
||||||
|
elseif datasetName == "tumor11"
|
||||||
|
X = vars["X"];
|
||||||
|
G = vars["G"];
|
||||||
|
end
|
||||||
|
end
|
||||||
148
src/conveniencePlots.jl
Normal file
148
src/conveniencePlots.jl
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
### TO-DO: Add support for arbitrary number of plots
|
||||||
|
|
||||||
|
#=
|
||||||
|
X, _, waves = importData("Raman_Adipose_fat_plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);ig_original");
|
||||||
|
X_SNV = SNV(X);
|
||||||
|
X_EMSC = EMSC(X,6);
|
||||||
|
plotr(waves, X, X_EMSC);
|
||||||
|
plotr(waves, X);
|
||||||
|
=#
|
||||||
|
|
||||||
|
|
||||||
|
function plotr(waves, X::Matrix{Float64})
|
||||||
|
|
||||||
|
plt = plot(waves, X', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function plotr(waves, X1::Matrix{Float64}, X2::Matrix{Float64})
|
||||||
|
|
||||||
|
plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function plotr(waves, X)
|
||||||
|
|
||||||
|
plt = plot(waves, X["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function plotr(waves, X1, X2)
|
||||||
|
|
||||||
|
plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function plotr(waves, X1::Matrix{Float64}, X2)
|
||||||
|
|
||||||
|
plta = plot(waves, X1', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
pltb = plot(waves, X2["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function plotr(waves, X1, X2::Matrix{Float64})
|
||||||
|
|
||||||
|
plta = plot(waves, X1["X_Cor"]', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
pltb = plot(waves, X2', legend=false, xlabel=L"Wavenumber (cm$^{-1}$)", ylabel="Raman intensity (arb. units)", xflip=true, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1::Matrix{Float64}, X2::Matrix{Float64})
|
||||||
|
|
||||||
|
plta = plot(X1', legend=false, grid=false);
|
||||||
|
pltb = plot(X2', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1::Vector{Float64}, X2::Vector{Float64})
|
||||||
|
|
||||||
|
plta = plot(X1, legend=false, grid=false);
|
||||||
|
pltb = plot(X2, legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1, X2::Matrix{Float64})
|
||||||
|
|
||||||
|
plta = plot(X1["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X2', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1::Matrix{Float64}, X2)
|
||||||
|
|
||||||
|
plta = plot(X1', legend=false, grid=false);
|
||||||
|
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1, X2)
|
||||||
|
|
||||||
|
plta = plot(X1["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1, X2, X_3)
|
||||||
|
|
||||||
|
plta = plot(X1["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltc = plot(X3["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, pltc, layout = (3,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1::Matrix{Float64}, X2, X3)
|
||||||
|
|
||||||
|
plta = plot(X1', legend=false, grid=false);
|
||||||
|
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltc = plot(X3["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, pltc, layout = (3,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1, X2::Matrix{Float64}, X3)
|
||||||
|
|
||||||
|
plta = plot(X1["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X2', legend=false, grid=false);
|
||||||
|
pltc = plot(X3["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, pltc, layout = (3,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
|
|
||||||
|
function compPlot(X1, X2, X3::Matrix{Float64})
|
||||||
|
|
||||||
|
plta = plot(X1["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X2["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltc = plot(X3', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, pltc, layout = (3,1));
|
||||||
|
display(plt);
|
||||||
|
#return plt;
|
||||||
|
end
|
||||||
388
src/preprocessing.jl
Normal file
388
src/preprocessing.jl
Normal file
|
|
@ -0,0 +1,388 @@
|
||||||
|
#=
|
||||||
|
X, y = importData("Raman_Adipose_fat_pig_original"); # RAMANPorkFat
|
||||||
|
X_MSC = MSC(X);
|
||||||
|
X_MSCm = MSC(X, vec(mean(X, dims=1)));
|
||||||
|
X_SNV = SNV(X);
|
||||||
|
X_EMSC = EMSC(X, 6);
|
||||||
|
X_EMSCi = EMSC(X, 6, "svd", 1, -1, X[71,:]-X[39,:]); # Only for Raman_Adipose_fat_pig_original
|
||||||
|
X_EMSCb = EMSC(X, 6, "svd", 6, 0);
|
||||||
|
X_EMSCt = EMSCTraditional(X, 6);
|
||||||
|
X_base = findBaseline(X, 6);
|
||||||
|
X_SV1 = savitzkyGolay(X, 6, 5, 2, false);
|
||||||
|
|
||||||
|
# compPlot(X_EMSC, X_EMSCi); # or do the below
|
||||||
|
plta = plot(X_EMSC["X_Cor"]', legend=false, grid=false);
|
||||||
|
pltb = plot(X_EMSCi["X_Cor"]', legend=false, grid=false);
|
||||||
|
plt = plot(plta, pltb, layout = (2,1));
|
||||||
|
display(plt);
|
||||||
|
=#
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function plegendre(d, p)
|
||||||
|
|
||||||
|
Calculates orthonormal Legendre polynomials using a QR factorisation.
|
||||||
|
Inputs:
|
||||||
|
- d : polynomial degree
|
||||||
|
- p : size of vector
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
- Q : (d+1) x p matrix with basis
|
||||||
|
- R : matrix from QR-factorisation
|
||||||
|
"""
|
||||||
|
|
||||||
|
function plegendre(d, p)
|
||||||
|
|
||||||
|
P = ones(p, d+1);
|
||||||
|
x = (-1:2/(p-1):1)';
|
||||||
|
for k in 1:d
|
||||||
|
P[:,k+1] = x.^k;
|
||||||
|
end
|
||||||
|
|
||||||
|
factorisation = qr(P);
|
||||||
|
Q = Matrix(factorisation.Q)';
|
||||||
|
R = Matrix(factorisation.R);
|
||||||
|
|
||||||
|
return Q, R
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function SNV(X)
|
||||||
|
|
||||||
|
Standard Normal Variate (SNV) preprocessing (for each sample: subtract mean, divide by standard deviation)
|
||||||
|
Returns dictionary with keys X\\_Cor, means, stds.
|
||||||
|
"""
|
||||||
|
function SNV(X)
|
||||||
|
|
||||||
|
X_SNV = zeros(size(X));
|
||||||
|
means = mean(X, dims=2);
|
||||||
|
stds = std(X, dims=2);
|
||||||
|
X_SNV = @. (X - means) / stds;
|
||||||
|
|
||||||
|
return_values = Dict([("X_Cor", X_SNV), ("means", means), ("stds", stds)]);
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function MSC(X, ref::String="mean"||Vector{Float64})
|
||||||
|
|
||||||
|
MSC preprocessing (subtract constant trend, scale based on projection onto ref. spectrum)
|
||||||
|
Second argument is mean (default) or svd (first right singular vector), or a vector to be used as reference spectrum
|
||||||
|
Returns dictionary with keys X\\_Cor, X\\_Ref, coeffs
|
||||||
|
"""
|
||||||
|
function MSC(X, ref::String="mean")
|
||||||
|
|
||||||
|
if ref == "mean"
|
||||||
|
X_Ref = vec(mean(X, dims=1));
|
||||||
|
elseif ref == "svd"
|
||||||
|
u, s, v = svd(X, full=false);
|
||||||
|
X_Ref = v[:,1];
|
||||||
|
if X_Ref[1] < 0 # Ad-hoc, attempts to correct for "flipped" spectra
|
||||||
|
X_Ref = -X_Ref;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
B = [ones(size(X,2), 1) X_Ref];
|
||||||
|
coeffs = B \ X';
|
||||||
|
X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:];
|
||||||
|
|
||||||
|
return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)])
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
function MSC(X, X_Ref::Vector{Float64})
|
||||||
|
|
||||||
|
B = [ones(size(X,2), 1) X_Ref];
|
||||||
|
coeffs = B \ X';
|
||||||
|
X_MSC = @. (X - coeffs[1,:]) / coeffs[2,:];
|
||||||
|
|
||||||
|
return_values = Dict([("X_Cor", X_MSC), ("X_Ref", X_Ref), ("coeffs", coeffs)])
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function EMSCTraditional(X, polDeg=2)
|
||||||
|
|
||||||
|
EMSC correction with mean spectrum as reference and polynomial trends of form LinRange(-1,1,p).^i.
|
||||||
|
First argument is spectra to be corrected, second argument is either degree for polynomial regression
|
||||||
|
or a basis to be used for correction (basis used should be output of this function, the reference
|
||||||
|
spectrum is assumed to be the first basis vector).
|
||||||
|
Returns dictionary with keys X\\_Cor, basis, coeffs
|
||||||
|
"""
|
||||||
|
function EMSCTraditional(X, polDeg::Int64=2)
|
||||||
|
|
||||||
|
n, p = size(X);
|
||||||
|
P = zeros(p, polDeg+1);
|
||||||
|
[P[:,i] = LinRange(-1, 1, p).^(i-1) for i in 1:polDeg+1];
|
||||||
|
refSpec = mean(X, dims=1)';
|
||||||
|
basis = [refSpec P];
|
||||||
|
|
||||||
|
return_values = EMSCTraditional(X, basis);
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function EMSCTraditionalCorrection(X, basis)
|
||||||
|
|
||||||
|
EMSC correction according to the given input basis that should be
|
||||||
|
the output of the EMSCTraditional function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
function EMSCTraditional(X, basis::Matrix{Float64})
|
||||||
|
|
||||||
|
coeffs = basis \ X';
|
||||||
|
X_Cor = (X - coeffs[2:end,:]' * basis[:,2:end]') ./ coeffs[1,:];
|
||||||
|
|
||||||
|
return_values = Dict([("X_Cor", X_Cor), ("basis", basis), ("coeffs", coeffs)]);
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false)
|
||||||
|
|
||||||
|
SavitzkyGolay filter.
|
||||||
|
Inputs:
|
||||||
|
X - data matrix
|
||||||
|
d - polynomial order
|
||||||
|
w - window size parameter (window size = 2*w+1)
|
||||||
|
der_order - derivative order
|
||||||
|
keep_endpoints - boolean, truncates spectra if false. NOTE: For derivatives MUST USE FALSE
|
||||||
|
|
||||||
|
Output: Dictionary with keys X\\_Cor, der\\_order, degree, window\\_size, filter\\_coeffs, keep\\_endpoints
|
||||||
|
"""
|
||||||
|
function savitzkyGolay(X, d=2, w=2, der_order=0, keep_endpoints=false)
|
||||||
|
|
||||||
|
if (d > 2*w+1) | (der_order > d)
|
||||||
|
error("Error! Must have d>2*w+1 and der_order > d")
|
||||||
|
end
|
||||||
|
|
||||||
|
n, p = size(X);
|
||||||
|
Q, R = plegendre(d, 2*w+1);
|
||||||
|
a = factorial(der_order) * (R \ Q)[der_order + 1,:];
|
||||||
|
X_Cor = zeros(n, p);
|
||||||
|
X_Cor[:, 1:w] = X[:, 1:w];
|
||||||
|
X_Cor[:, end-w:end] = X[:, end-w:end];
|
||||||
|
|
||||||
|
for i in 1:n
|
||||||
|
for j in 1:p-2*w-1
|
||||||
|
X_Cor[i,j+w] = X[i, j : j + 2*w]'a;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
if !keep_endpoints
|
||||||
|
X_Cor = X_Cor[:, w+1:end];
|
||||||
|
X_Cor = X_Cor[:, 1:end-w-1];
|
||||||
|
end
|
||||||
|
|
||||||
|
return Dict([("X_Cor", X_Cor), ("der_order", der_order), ("degree", d), ("window_size", w), ("filter_coeffs", a), ("keep_endpoints", keep_endpoints)]);
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
|
||||||
|
function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
|
||||||
|
|
||||||
|
Finding and correcting baseline using iterative polynomial fits
|
||||||
|
Implementation of baseline correcting algorithm in Automated Method for
|
||||||
|
Subtraction of Fluorescence from Biological Raman Spectra (Lieber and
|
||||||
|
Mahadevan-Jansen) as described in Optimal choice of baseline correction
|
||||||
|
for multivariate calibration of spectra (Liland and Mevik).
|
||||||
|
NOTE: Changed criterion to while loop to relative error and lowered threshold considerably compared to MATLAB code.
|
||||||
|
The first function finds the baseline, the second functions calls the first function repeatedly on all rows in X.
|
||||||
|
"""
|
||||||
|
function findBaseline(v::Vector{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
|
||||||
|
p = length(v);
|
||||||
|
P, _ = plegendre(polDeg, p);
|
||||||
|
prevCoeffs = zeros(polDeg+1);
|
||||||
|
newCoeffs = ones(polDeg+1);
|
||||||
|
vcur = v;
|
||||||
|
vnew = copy(v);
|
||||||
|
nIter = 0
|
||||||
|
|
||||||
|
while (norm(P' * (prevCoeffs-newCoeffs))/norm(P'*prevCoeffs) > thresh) && (nIter < maxIter)
|
||||||
|
|
||||||
|
prevCoeffs = newCoeffs;
|
||||||
|
newCoeffs = P' \ vnew;
|
||||||
|
vnew = P' * newCoeffs;
|
||||||
|
|
||||||
|
for i in 1:p
|
||||||
|
if vcur[i] < vnew[i]
|
||||||
|
vnew[i] = vcur[i];
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
nIter = nIter + 1;
|
||||||
|
end
|
||||||
|
|
||||||
|
if nIter == maxIter
|
||||||
|
println("Warning! Reached maximum number of iterations!");
|
||||||
|
end
|
||||||
|
|
||||||
|
baseline = vec(newCoeffs' * P);
|
||||||
|
coeffs = newCoeffs';
|
||||||
|
v_cor = v - baseline;
|
||||||
|
|
||||||
|
return v_cor, baseline, coeffs
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
function findBaseline(X::Matrix{Float64}, polDeg=2, maxIter=100, thresh=1e-2)
|
||||||
|
|
||||||
|
n, p = size(X);
|
||||||
|
X_Cor = zeros(n, p);
|
||||||
|
baseline = zeros(n,p);
|
||||||
|
coeffs = zeros(n, polDeg+1);
|
||||||
|
|
||||||
|
for i=1:n
|
||||||
|
_, baseline[i,:], coeffs[i,:] = findBaseline(X[i,:], polDeg, maxIter, thresh);
|
||||||
|
X_Cor[i,:] = X[i,:] - baseline[i,:];
|
||||||
|
end
|
||||||
|
|
||||||
|
return_values = Dict([("baseline", baseline), ("X_Cor", X_Cor)]);
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0)
|
||||||
|
|
||||||
|
Implementation of EMSC pre-processing based on Skogholt et al. (2018).
|
||||||
|
intF is 0 (no interferent), vector, or matrix with intereferents as rows.
|
||||||
|
Returns dictionary with keys X\\_Cor, model, coeffs
|
||||||
|
"""
|
||||||
|
function EMSC(X, polDeg=2, refType="svd", nRef=1, baseDeg=-1, intF=0)
|
||||||
|
|
||||||
|
if intF == 0
|
||||||
|
nIntF = 0;
|
||||||
|
elseif typeof(intF)==Vector{Float64}
|
||||||
|
intF = reshape(intF, 1, length(intF));
|
||||||
|
nIntF = 1;
|
||||||
|
else
|
||||||
|
nIntF = size(intF, 1);
|
||||||
|
end
|
||||||
|
|
||||||
|
n, p = size(X);
|
||||||
|
P, _ = plegendre(polDeg, p);
|
||||||
|
refSpec = zeros(nRef, p);
|
||||||
|
tot = polDeg + 1 + nIntF;
|
||||||
|
|
||||||
|
if refType == "svd"
|
||||||
|
_, _, v = svd(X, full=false);
|
||||||
|
refSpec = v[:,1:nRef];
|
||||||
|
elseif refType == "mean"
|
||||||
|
refSpec = mean(X, dims=1)';
|
||||||
|
end
|
||||||
|
|
||||||
|
# Correcting sign if ref. vectors are flipped. Ad-hoc but appears to work
|
||||||
|
# well in practice.
|
||||||
|
correlations = cor(X', refSpec);
|
||||||
|
for i in 1:nRef
|
||||||
|
if sum(correlations[:,i] .< 0) > n/2
|
||||||
|
refSpec[i,:] = -refSpec[i,:];
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creating basis, need ref. spectra at the end to correct for interferents
|
||||||
|
# and polynoial trends.
|
||||||
|
if nIntF == 0
|
||||||
|
basis, R = qr([P' refSpec]);
|
||||||
|
basis = Matrix(basis)';
|
||||||
|
else
|
||||||
|
basis, R = qr([P' intF' refSpec]);
|
||||||
|
basis = Matrix(basis)';
|
||||||
|
end
|
||||||
|
|
||||||
|
# Checking if QR factorisation "flipped" any of the vectors and if so flip them back.
|
||||||
|
for i in (tot+1):(tot+nRef)
|
||||||
|
if R[i,i] < 0
|
||||||
|
R[i,:] = -R[i,:];
|
||||||
|
basis[i,:] = -basis[i,:];
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
refPolPol = vec( R[1:tot, tot+1]' * basis[1:tot,:] / R[tot+1, tot+1]);
|
||||||
|
|
||||||
|
# Optional baseline correction. Baseline calculated on (first) reference spectrum and same baseline is subtracted
|
||||||
|
# from all spectra (to not affect prediction).
|
||||||
|
if baseDeg > -1
|
||||||
|
_, baseline = findBaseline(refSpec[:,1], baseDeg, 1000, 1e-4);
|
||||||
|
else
|
||||||
|
baseline = zeros(p);
|
||||||
|
end
|
||||||
|
|
||||||
|
model = EMSCModel(basis, R, Vector([polDeg+1, nIntF, nRef]), refPolPol, baseline);
|
||||||
|
return_values = EMSCCorrection(X, model);
|
||||||
|
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
- EMSCCorrection(X, model::EMSCModel)
|
||||||
|
|
||||||
|
Performs EMSC correction on X with given model.
|
||||||
|
Returns dictionary with keys X\\_Cor, model, coeffs
|
||||||
|
"""
|
||||||
|
function EMSCCorrection(X, model::EMSCModel)
|
||||||
|
|
||||||
|
n, _ = size(X);
|
||||||
|
|
||||||
|
# Calculate coefficients (utilizing orthogonl basis),
|
||||||
|
# Subtract projection onto vectors modelling unwanted additive stuff,
|
||||||
|
# Normalise L2-norm of spectra in subspace spanned by reference spectrum
|
||||||
|
tot = sum(model.basisExplanation[1:2]);
|
||||||
|
coeffs = X * model.basis';
|
||||||
|
mult = sqrt.(sum(coeffs[:, tot+1:end].^2, dims=2));
|
||||||
|
X_Cor = X - coeffs[:,1:tot] * model.basis[1:tot,:];
|
||||||
|
X_Cor = X_Cor ./ mult;
|
||||||
|
|
||||||
|
# Below is purely for visualisation.
|
||||||
|
# The spectra look strange without polynomial trends,
|
||||||
|
# so what is done below is adding back the polynomial trends to the reference spectrum.
|
||||||
|
# This is done by using the fact that R[1:tot, tot+1] contains the projection of the (first)
|
||||||
|
# reference spectrum onto the previous basis vectors. This allows us to easily add back the polynomial trends.
|
||||||
|
# As the same vector is added to each spectrum predictions ar eunaffected.
|
||||||
|
[X_Cor[i,:] = X_Cor[i,:] - model.refPolPol for i in 1:n];
|
||||||
|
[X_Cor[i,:] = X_Cor[i,:] - model.baseline for i in 1:n]; # Surely this should be possible with broadcasting instead...
|
||||||
|
# Thought it would be + baseline above, but this works...
|
||||||
|
|
||||||
|
return_values = Dict([("X_Cor", X_Cor), ("model", model), ("coeffs", coeffs)]);
|
||||||
|
return return_values
|
||||||
|
end
|
||||||
Loading…
Add table
Reference in a new issue