# Load the helper code and the data set exactly once. The helper file and the
# data file used to be loaded twice in a row (once before and once after
# `using LinearAlgebra`); a single load after the `using` is sufficient.
using LinearAlgebra
include("scripts/pca_demo_helpers.jl") # implements pPCA (and presumably readDataSet -- confirm in the helper file)
X = readDataSet("datasets/virus3.dat")

# pPCA is handed the transpose of X, materialized as a plain Matrix.
# The second argument (2) is the number of latent dimensions.
(θ, Z) = pPCA(Matrix(X'), 2) # uses EM, implemented in scripts/pca_demo_helpers.jl. Feel free to try more/less dimensions.

using PyPlot
# Visualize the 2-D projection: each column of Z is one data point, and the
# point's column number is printed at its projected position.
first_coord = Z[1, :]
second_coord = Z[2, :]
# The trace itself is drawn in white ("w"); presumably it only serves to set up
# the axes, since the visible content is the text labels added below.
plot(first_coord, second_coord, "w")
for (label, (x, y)) in enumerate(zip(first_coord, second_coord))
    PyPlot.text(x, y, string(label); fontsize=10) # label at the data point's position
end
title(
    "Projection of Tobamovirus data set on two dimensions (numbers correspond to data points)";
    fontsize=10,
);

# Build a floating point COPY of X so NaN can be used to indicate missing values.
# The constructor is used instead of `convert`, because `convert` hands back X
# itself when X already is a Matrix{Float64}; the in-place NaN assignment below
# would then destroy the original data as well.
X_corrupt = Matrix{Float64}(X)
# Each entry is independently marked as missing with probability 0.2. The draw
# is not seeded, so a different set of entries is removed on every run.
indices = findall(rand(Float64, size(X)) .< 0.2)
X_corrupt[indices] .= NaN
println(X_corrupt)

(θ, Z) = pPCA(Matrix(X_corrupt'), 2) # Perform pPCA on the corrupted data set

# Start a fresh figure: without it, running this file as a script draws the
# corrupted projection on top of the figure left current by the earlier plot.
figure()
# The trace is drawn in white ("w"); the visible content is the text labels.
plot(Z[1, :], Z[2, :], "w")
for n in 1:size(Z, 2)
    PyPlot.text(Z[1, n], Z[2, n], string(n), fontsize=10) # put a label on the position of the data point
end
title("Projection of CORRUPTED Tobamovirus data set on two dimensions", fontsize=10);