`Images.label_components`
to label connected pixels (blobs).
Some features:
I want to thank:
using Images, ImageDraw
using Interact, BenchmarkTools
# Load one sample galaxy image for interactive exploration.
sampledir = "/Users/ken/Coding/astrohack/Astrohack/Sample_Data/SAMPLE/"
files = readdir(sampledir)
# Keep only the "-g.csv" images. Matching the full suffix (instead of testing the
# single character f[end-4]) cannot throw on short file names and cannot pick up
# unrelated files that merely have a 'g' in that position.
ids = String[first(split(f, "-")) for f in files if endswith(f, "-g.csv")];
id = ids[27]
@time img_raw = readcsv(joinpath(sampledir, id * "-g.csv"), Float32)
img_gray = colorview(Gray, img_raw)
1.183841 seconds (2.03 M allocations: 70.628 MB, 1.49% gc time)
# Build a "peaks suppressed" copy of the image: every pixel brighter than the
# cutoff is zeroed. The cutoff is the centre pixel value, capped at 1.
img_peaks = copy(img_raw)
cutoff = min(1, img_raw[fld.(size(img_raw), 2)...])
# Use the computed cutoff (the literal 1 left `cutoff` unused and disagreed with idplot).
img_peaks[img_peaks .> cutoff] = 0
img = colorview(RGB, img_peaks,img_peaks,img_raw)
summary(img)
"151×151 ColorView{RGB}(::ImageCore.StackedView{Float32,3,Tuple{Array{Float32,2},Array{Float32,2},Array{Float32,2}}}) with element type ColorTypes.RGB{Float32}"
# Smooth the image, threshold it into a binary mask and label the connected
# components; the component that contains the centre pixel is then painted
# into its own layer at half intensity.
bin_th = 0.05
img_smooth = imfilter(img_raw, Kernel.gaussian(2))
img_bin = img_smooth .> bin_th
colorview(Gray, img_bin)
labs = label_components(img_bin);
centre = fld.(size(labs), 2)
labs_ind = labs .== labs[centre...]
labs_blob = zeros(img_raw)
labs_blob[labs_ind] = 0.5
0.5
# Inspection overlay: central blob in the first (red) channel, peak-suppressed
# image in the second (green), a copy of the raw image in the third (blue).
# NOTE(review): the copy presumably keeps later in-place drawing on img_labs
# from writing into img_raw — confirm.
img_labs = colorview(RGB, labs_blob,img_peaks,copy(img_raw))
# Find the box around the centre that contains the galaxy, based on the centre label.
# It is made robust by exploiting the galaxy's symmetry: we take the minimum of opposite directions.
@everywhere function findbox(inds)
    # Measure how far the `true` entries of the boolean matrix `inds` reach from
    # the centre pixel in each of the four directions, and return the smaller
    # reach per axis as `(half_width, half_height)`.
    # Taking the minimum of opposite directions keeps the box tight when the
    # mask is lopsided. Returns (0, 0) when no entry is set.
    nrows, ncols = size(inds)
    ci, cj = fld.(size(inds), 2)
    up = down = left = right = 0
    # column-major traversal: rows vary fastest
    for col = 1:ncols, row = 1:nrows
        inds[row, col] || continue
        if row < ci
            up = max(up, ci - row)
        else
            down = max(down, row - ci)
        end
        if col < cj
            left = max(left, cj - col)
        else
            right = max(right, col - cj)
        end
    end
    return min(right, left), min(up, down)
end
findbox(labs_ind)
(24,24)
@benchmark findbox($labs_ind) #it's fast, thanks Julia!
BenchmarkTools.Trial: memory estimate: 0 bytes allocs estimate: 0 -------------- minimum time: 291.487 μs (0.00% GC) median time: 321.457 μs (0.00% GC) mean time: 339.062 μs (0.00% GC) maximum time: 1.559 ms (0.00% GC) -------------- samples: 10000 evals/sample: 1
# draw the box on the image
function drawcbox!(img, w, h) #width, height
    # Draw the outline of a rectangle centred on the image centre, reaching `w`
    # pixels to each side and `h` pixels up and down, by writing
    # `one(eltype(img))` into the border pixels. Mutates and returns `img`.
    ci, cj = fld.(size(img), 2)
    stroke = one(eltype(img))
    top, bottom = ci - h, ci + h
    left, right = cj - w, cj + w
    # vertical edges
    for row = top:bottom
        img[row, left] = stroke
        img[row, right] = stroke
    end
    # horizontal edges
    for col = left:right
        img[bottom, col] = stroke
        img[top, col] = stroke
    end
    return img
end
drawcbox! (generic function with 1 method)
drawcbox!(img_labs, findbox(labs_ind)...)
function idplot(i, binth=0.05, blobth=0.1)
    # Build the diagnostic overlay for the `i`-th sample galaxy.
    #
    # Arguments:
    #   i      - index into the global `ids` vector
    #   binth  - threshold applied to the Gaussian-smoothed image to binarise it
    #   blobth - minimum LoG blob amplitude for a blob to be circled; this used
    #            to be a hard-coded 0.1, which is kept as the default
    #
    # Returns the RGB overlay (the value returned by `drawcbox!`) with the
    # central blob, the detected blobs as circles and the centred bounding box.
    sampledir = "/Users/ken/Coding/astrohack/Astrohack/Sample_Data/SAMPLE/"
    img_raw = readcsv(joinpath(sampledir, ids[i]*"-g.csv"))#, Float32)
    ci, cj = fld.(size(img_raw), 2)

    # zero every pixel brighter than the centre pixel (capped at 1)
    img_peaks = copy(img_raw)
    cutoff = min(1, img_raw[ci, cj])
    img_peaks[img_peaks .> cutoff] = 0

    # binarise the smoothed image and keep the component under the centre pixel
    img_bin = falses(img_raw)
    img_smooth = imfilter(img_raw, Kernel.gaussian(2))
    img_bin[img_smooth .> binth] = true
    labels = label_components(img_bin)
    img_cblob = zeros(img_raw)
    labs_ind = labels .== labels[ci, cj]
    img_cblob[labs_ind] = 0.5
    img_labs = colorview(RGB, img_cblob, img_peaks, img_raw)

    # circle every sufficiently strong blob
    blobs = blob_LoG(img_raw, 1:4) #TODO issue for float32?
    bigblobs = [blob for blob in blobs if blob.amplitude > blobth]
    for blob in bigblobs
        circle = CirclePointRadius(blob.location, sqrt(2)*blob.σ)
        draw!(img_labs, circle, RGB(one(eltype(img_raw)),0,0))
    end
    drawcbox!(img_labs, findbox(labs_ind)...)
end
WARNING: Method definition idplot(Any) in module Main at In[17]:2 overwritten at In[18]:2. WARNING: Method definition idplot(Any, Any) in module Main at In[17]:2 overwritten at In[18]:2.
idplot (generic function with 2 methods)
# Interact widget: browse the sample galaxies (i) and tune the binarisation
# threshold (binth); the overlay is rebuilt by idplot for each selection.
@manipulate for i = 1:length(ids),
binth = 0.01:0.005:0.08
idplot(i, binth)
end
(= only entropy and symmetry)
`label_components`
to find the central galaxy. Then run good ol' XGBoost on the features
imROF
for denoising
ci, cj = fld.(size(img_raw), 2)
# Crop the galaxy out of the raw image, using at least BOXMIN pixels on each
# side of the centre.
BOXMIN = 10
# Clamp each half-extent on its own: `max` applied to two tuples compares them
# lexicographically, so it would not enforce the minimum per dimension and
# could even throw away a larger extent that findbox found.
cwidth, cheight = findbox(labs_ind)
cwidth = max(cwidth, BOXMIN)
cheight = max(cheight, BOXMIN)
img_crop = img_raw[ci-cheight:ci+cheight, cj-cwidth:cj+cwidth]
colorview(Gray, img_crop)
# copy of the raw image with everything outside the central blob zeroed
img_onlycblob = copy(img_raw)
img_onlycblob[!labs_ind] = 0
# square crop so that eigs can be applied to the image directly
boxsize = max(cwidth, cheight)
img_box = img_raw[ci-boxsize:ci+boxsize, cj-boxsize:cj+boxsize]
colorview(Gray, img_box)
@time reseigs = eigs(img_box, nev=10)
norm.(reseigs[1]).^2
0.002158 seconds (738 allocations: 61.313 KB)
10-element Array{Float32,1}: 645.704 17.6002 0.942475 0.887299 0.0973208 0.093082 0.0130057 0.0130057 0.0113171 0.0113171
# Same spectrum on the rectangular crop: img_crop'*img_crop is square even when
# the crop is not. Request 10 eigenvalues and take their magnitudes.
@time reseigs = eigs(img_crop'*img_crop, nev=10)
norm.(reseigs[1])
0.001327 seconds (286 allocations: 31.688 KB)
10-element Array{Float32,1}: 647.59 17.7356 1.0503 0.901244 0.144213 0.129747 0.0570758 0.0509714 0.0372561 0.0361984
entropy(img_crop)
4.054558771384226
# Residual of the crop after subtracting its Gaussian-filtered version
# (Kernel.gaussian(1)); its maximum is inspected as a rough noise indicator.
noise = img_crop - imfilter(img_crop, Kernel.gaussian(1))
maximum(noise)
0.5881191265860775
entropy(img_crop - imfilter(img_crop, Kernel.gaussian(2)))
2.7846815517236028
using StatsBase
addprocs();
# `process` runs on the workers and calls both Images functions and
# StatsBase.percentile there, so both packages must be loaded on every process.
# Loading StatsBase only on the master before addprocs() does not make it
# available on workers that are added afterwards.
# NOTE(review): `findbox` is defined with @everywhere earlier in this file,
# i.e. before the workers exist — re-run that definition after addprocs().
@everywhere using Images, StatsBase
traindir = "/Users/ken/Coding/astrohack/Train/"
trainlist = readdir(traindir);
# remove i-pics once
# i_files = filter(x->x[end-4]=='i', trainlist)
# cd(traindir)
# for file in i_files
# rm(file)
# end
using DataFrames
trainmeta = readtable("/Users/ken/Coding/astrohack/Train.csv",separator=';')
sort!(trainmeta) #sort on ID to match directory listing!
SDSS_ID | logMstar | err_logMstar | Distance | |
---|---|---|---|---|
1 | 1237645879578460255 | 11.100999999999999 | 0.028999999999999998 | 518.0174523561253 |
2 | 1237645941824356443 | 10.539000000000001 | 0.032 | 172.10307774074076 |
3 | 1237645943974396134 | 11.318 | 0.015 | 654.2479282849002 |
4 | 1237645943975837722 | 10.862 | 0.026000000000000002 | 285.27259536182333 |
5 | 1237645943978328289 | 9.485 | 0.042 | 165.69725598860398 |
6 | 1237645943978393694 | 10.755999999999998 | 0.028999999999999998 | 173.81129687464386 |
7 | 1237645943978983579 | 10.168 | 0.024 | 207.54862476923074 |
8 | 1237645943978983683 | 11.792 | 0.017 | 794.3218972649573 |
9 | 1237645943979114622 | 10.932 | 0.044000000000000004 | 168.25958468945865 |
10 | 1237645943979507954 | 10.82 | 0.012 | 158.43732466951565 |
11 | 1237645943979638945 | 10.09 | 0.04 | 163.56198207122506 |
12 | 1237646796526715482 | 10.455 | 0.033 | 222.49554219088319 |
13 | 1237646797600522764 | 11.238 | 0.013999999999999999 | 390.32807209686604 |
14 | 1237648672921813985 | 9.585 | 0.057999999999999996 | 187.90410472934474 |
15 | 1237648672922337955 | 10.74 | 0.031 | 196.0181456153846 |
16 | 1237648673458684355 | 10.677999999999999 | 0.028999999999999998 | 215.66266565527064 |
17 | 1237648673458881081 | 10.315 | 0.032 | 253.67054138461535 |
18 | 1237648673459077169 | 11.09 | 0.011000000000000001 | 252.38937703418802 |
19 | 1237648673459274344 | 11.027999999999999 | 0.022000000000000002 | 400.5773869002848 |
20 | 1237648673459339331 | 11.205 | 0.027999999999999997 | 399.72327733333333 |
21 | 1237648673459667002 | 9.934 | 0.043 | 190.03937864672363 |
22 | 1237648673459667234 | 10.974 | 0.024 | 197.29930996581194 |
23 | 1237648673460585255 | 10.957 | 0.016 | 299.79245799999995 |
24 | 1237648673971437623 | 10.227 | 0.015 | 53.38184793447294 |
25 | 1237648673995162093 | 10.449000000000002 | 0.032 | 131.5328733105413 |
26 | 1237648673995686549 | 11.024000000000001 | 0.019 | 338.22738851282054 |
27 | 1237648673995948107 | 10.299000000000001 | 0.045 | 116.58595588888888 |
28 | 1237648673996997574 | 10.603 | 0.048 | 252.8164318176638 |
29 | 1237648673997127724 | 10.638 | 0.026000000000000002 | 187.90410472934474 |
30 | 1237648674510995594 | 11.522 | 0.013999999999999999 | 289.5431431965812 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
size(trainmeta,1) == length(trainlist)
true
@everywhere function process(file)
    # Extract the feature vector for one galaxy image stored as a CSV file.
    #
    # `file` is the path of the CSV image. The result is a Float32 vector:
    #   p1, p2  - 1st and 5th percentile of all pixel values (noise level)
    #   s1      - sum of the masked crop
    #   s2, s3  - sum and sum of squares of (masked crop - smoothed crop)
    #   s4      - number of pixels in the central component
    #   s5      - norm of the masked crop
    #   e1, e2  - entropy of the masked crop and of its high-pass residual
    #   spect   - magnitudes of 10 eigenvalues of crop' * crop
    BINTH = 0.05 # binarisation threshold on the smoothed image, chosen by eye
    BOXMIN = 10 # minimum half-width / half-height of the crop, in pixels
    T = Float32 # typical premature optimization
    img_raw = readcsv(file, T)
    ci, cj = fld.(size(img_raw), 2)

    # get percentiles for noise levels
    img_vec = vec(img_raw)
    p1 = StatsBase.percentile(img_vec, 1)
    p2 = StatsBase.percentile(img_vec, 5)

    # select the galaxy by thresholding smoothed image and labeling blobs
    img_smooth = imfilter(T, img_raw, Kernel.gaussian(2))
    img_bin = img_smooth .> BINTH
    labels = label_components(img_bin)
    labs_ind = labels .== labels[ci, cj]

    # crop galaxy; clamp each half-extent separately because `max` on two
    # tuples compares them lexicographically and does not act per dimension
    cwidth, cheight = findbox(labs_ind)
    cwidth = max(cwidth, BOXMIN)
    cheight = max(cheight, BOXMIN)
    crop_ind = ci-cheight:ci+cheight, cj-cwidth:cj+cwidth
    img_crop = img_raw[crop_ind...]

    # Crudely set non galaxy values in crop to zero to remove possible other galaxy
    labels_ind_crop = labs_ind[crop_ind...]
    img_crop[!labels_ind_crop] = 0

    # get top eigenvalues (spectra) for symmetry features
    reseigs = eigs(img_crop'*img_crop, nev=10)
    spect = norm.(reseigs[1])

    # get sum of intensities and (squared) noise
    s1 = sum(img_crop)
    img_crop_smooth = view(img_smooth, crop_ind...)
    resid = img_crop - img_crop_smooth # computed once, used for s2 and s3
    s2 = sum(resid)
    s3 = sum(resid.^2)
    s4 = sum(labs_ind) #number of galaxy pixels
    s5 = norm(img_crop) #why not..

    # get entropy
    e1 = Images.entropy(img_crop)
    e2 = Images.entropy(img_crop - imfilter(T, img_crop, Kernel.gaussian(2)))
    return T[p1,p2, s1, s2, s3, s4, s5, e1, e2, spect...]
end
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3. WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3WARNING: Method definition process. (Any) in module Main at In[189]:3 overwritten at In[191]:3.
# test the function
@time process(joinpath(traindir, trainlist[10]))
0.011020 seconds (35.67 k allocations: 2.102 MB)
19-element Array{Float32,1}: -0.019007 -0.0117548 309.332 -11.8464 39.1035 1283.0 19.6011 4.088 3.27034 384.204 8.92826 0.288739 0.174709 0.0511862 0.0380456 0.0332355 0.0284405 0.0271679 0.0230588
# test on other worker thread
@time remotecall_fetch(process, 2, joinpath(traindir, trainlist[10]))
0.009558 seconds (162 allocations: 8.563 KB)
19-element Array{Float32,1}: -0.019007 -0.0117548 309.332 -11.8464 39.1035 1283.0 19.6011 4.088 3.27034 384.204 8.92826 0.288739 0.174709 0.0511882 0.0380461 0.0332357 0.0284385 0.0271685 0.0230591
# @time begin
# N = 100
# fs = zeros(N, 15)
# fs[:, 1] = trainmeta[:Distance][1:N]
# @sync @parallel for i=1:N
# file = joinpath(traindir, trainlist[i])
# fs[i, 2:end] = process(file)
# end
# end
# process all train images in parallel and extract features
# longest step, about 6 minutes on my 2015 macbook pro
@time resmap = pmap(process, String[joinpath(traindir, f) for f in trainlist]);
387.255984 seconds (11.21 M allocations: 448.294 MB, 0.18% gc time)
# Assemble the training matrix: column 1 is the distance, the remaining
# columns are the per-image feature vectors returned by `process`.
feats = zeros(eltype(first(resmap)), length(resmap), length(first(resmap)) + 1)
feats[:, 1] = trainmeta[:Distance]
for (row, fvec) in enumerate(resmap)
    feats[row, 2:end] = fvec
end
values = convert(Array{Float32}, trainmeta[:logMstar]);
XGBoost has a known compile issue on macOS because clang does not ship with multithreaded OpenMP support.
So I installed gcc with Homebrew (`brew install gcc --without-multilib`), but you still have to link gcc-6 and g++-6 locally!
I installed the package with `Pkg.add("XGBoost")`,
and in the `deps` folder I copied `config.mk` to the main xgboost folder
and changed gcc and g++ to the local `/usr/local/bin/gcc-6` paths.
using XGBoost
sum(values .== -99) #missing values!
1374
# Rebuild features, targets and target errors without the rows whose target
# is the missing-value sentinel -99.
T = eltype(resmap[1])
nrows = length(resmap) - count(v -> v == -99, values)
feats2 = zeros(T, nrows, size(feats, 2))
values2 = zeros(T, nrows)
err2 = zeros(T, nrows)
currenti = 1 # next free row in the filtered arrays
for i = 1:length(resmap)
    values[i] == -99 && continue
    feats2[currenti, 1] = trainmeta[:Distance][i]
    feats2[currenti, 2:end] = resmap[i]
    values2[currenti] = values[i]
    err2[currenti] = trainmeta[:err_logMstar][i]
    currenti += 1
end
size(feats2)
(74891,20)
using Plots
gr()
Plots.GRBackend()
histogram(values2)
# Random train / held-out split of the rows that have a target value.
rand_ids = shuffle(1:length(values2));
test_n = 1734
train_n = length(values2) - test_n
train_ids = rand_ids[1:train_n]
#train_sample = sample(1:length(feats), train_n, replace=false)
train_x = feats2[train_ids, :]
train_y = values2[train_ids]
# The held-out rows start right after the training rows. Starting at train_n
# would reuse the last training sample and yield test_n + 1 rows.
test_ids= rand_ids[train_n+1:train_n+test_n]
test_x = feats2[test_ids, :]
test_y = values2[test_ids]
test_err= err2[test_ids];
# Fit the gradient-boosted model on the training split (the log shows 50 rounds).
# NOTE(review): with max_depth=50 the logged train RMSE falls to about 5e-4,
# which looks like overfitting — judge the model on the held-out split.
@time boost = xgboost(train_x, 50, label=train_y, max_depth=50, learning_rate=0.3)
[1] train-rmse:7.053658 [2] train-rmse:4.942040 [3] train-rmse:3.464624 [4] train-rmse:2.431568 [5] train-rmse:1.710097 [6] train-rmse:1.207397 [7] train-rmse:0.858015 [8] train-rmse:0.615704 [9] train-rmse:0.447069 [10] train-rmse:0.328264 [11] train-rmse:0.243532 [12] train-rmse:0.182273 [13] train-rmse:0.137520 [14] train-rmse:0.104366 [15] train-rmse:0.079667 [16] train-rmse:0.061113 [17] train-rmse:0.047157 [18] train-rmse:0.036617 [19] train-rmse:0.028589 [20] train-rmse:0.022415 [21] train-rmse:0.017671 [22] train-rmse:0.014006 [23] train-rmse:0.011178 [24] train-rmse:0.008958 [25] train-rmse:0.007220 [26] train-rmse:0.005842 [27] train-rmse:0.004757 [28] train-rmse:0.003882 [29] train-rmse:0.003181 [30] train-rmse:0.002616 [31] train-rmse:0.002161 [32] train-rmse:0.001797 [33] train-rmse:0.001501 [34] train-rmse:0.001260 [35] train-rmse:0.001060 [36] train-rmse:0.000909 [37] train-rmse:0.000785 [38] train-rmse:0.000686 [39] train-rmse:0.000604 [40] train-rmse:0.000551 [41] train-rmse:0.000512 [42] train-rmse:0.000505 [43] train-rmse:0.000468 [44] train-rmse:0.000467
15.151809 seconds (7.82 k allocations: 6.371 MB)
[45] train-rmse:0.000464 [46] train-rmse:0.000464 [47] train-rmse:0.000464 [48] train-rmse:0.000464 [49] train-rmse:0.000464 [50] train-rmse:0.000464
XGBoost.Booster(Ptr{Void} @0x00007f9bcb14c100)
test_pred = XGBoost.predict(boost, test_x);
hcat(test_y, test_pred, test_err)
1735×3 Array{Float32,2}: 11.182 11.182 0.009 11.51 11.5777 0.026 10.907 10.8396 0.037 10.59 10.5571 0.041 11.42 11.2049 0.031 10.256 10.011 0.024 11.171 10.6016 0.025 9.349 9.34016 0.055 10.296 10.2444 0.014 10.941 10.942 0.01 10.71 10.7143 0.042 11.307 11.2312 0.034 9.463 9.29443 0.045 ⋮ 9.971 10.0718 0.03 10.808 10.8301 0.022 10.317 10.4416 0.056 9.23 9.22179 0.056 9.746 9.84914 0.033 11.514 11.4574 0.029 10.371 10.3303 0.021 10.349 10.3235 0.033 10.192 9.96918 0.063 10.101 10.2186 0.028 10.792 11.0878 0.023 10.557 10.4991 0.034
# Held-out score: squared residuals, each divided by the squared target error, summed.
sum((test_y - test_pred).^2 ./ (test_err.^2))
133952.8f0
# Read the validation IDs, flatten them into a sorted vector and derive the
# path of the matching "-g.csv" image for every ID.
sub_ids = readcsv("validationdata_SDSSID.csv", Int, header=false)
sub_ids = sort!(vec(sub_ids))
sub_dir = "/Users/ken/Coding/astrohack/Test/"
sub_files = map(n -> joinpath(sub_dir, string(n) * "-g.csv"), sub_ids);
length(sub_ids)
1734
KERNEL EXCEPTION BoundsError: attempt to access 13-element Array{UInt8,1} at index [-1] in next at ./strings/string.jl:92 [inlined] in getindex(::String, ::Int64) at ./strings/basic.jl:70 in ind2chr(::String, ::Int64) at ./strings/basic.jl:227 in complete_request(::ZMQ.Socket, ::IJulia.Msg) at /Users/ken/.julia/v0.5/IJulia/src/handlers.jl:40 in eventloop(::ZMQ.Socket) at /Users/ken/.julia/v0.5/IJulia/src/eventloop.jl:8 in (::IJulia.##13#19)() at ./task.jl:360
testfiles = readdir(sub_dir);
# remove i-pics once
# i_files = filter(x->x[end-4]=='i', testfiles)
# cd(sub_dir)
# for file in i_files
# rm(file)
# end
@time subresmap = pmap(process, sub_files);
7.182995 seconds (234.93 k allocations: 9.468 MB)
submeta = readtable("/Users/ken/Coding/astrohack/Test_Distance.csv",separator=';')
sort!(submeta)
SDSS_ID | Distance | |
---|---|---|
1 | 1237645943978590386 | 160.57259858689457 |
2 | 1237645943979114582 | 169.9678038233618 |
3 | 1237645943979311221 | 152.88561248433047 |
4 | 1237648672922468973 | 198.58047431623933 |
5 | 1237648673992671592 | 232.3178022108262 |
6 | 1237648674529476993 | 137.08458549572646 |
7 | 1237648674532753564 | 371.5376616239317 |
8 | 1237648675603874096 | 140.07396898005695 |
9 | 1237648702972887077 | 367.6941685726495 |
10 | 1237648702978654353 | 212.67328217094013 |
11 | 1237648702979244208 | 231.03663786039883 |
12 | 1237648702983897351 | 287.83492406267806 |
13 | 1237648702985797942 | 299.7924579999999 |
14 | 1237648702986125622 | 518.0174523561253 |
15 | 1237648702992023572 | 144.7715715982906 |
16 | 1237648703503794279 | 300.64656756695155 |
17 | 1237648703504515190 | 108.0448602193732 |
18 | 1237648703505432738 | 102.49314803418802 |
19 | 1237648703505629237 | 477.4472479259259 |
20 | 1237648703508119586 | 98.64965498290596 |
21 | 1237648703509758235 | 366.41300422222224 |
22 | 1237648703512772638 | 374.9540998917379 |
23 | 1237648703516508315 | 510.33046625356116 |
24 | 1237648703527649617 | 102.9202028176638 |
25 | 1237648703531123239 | 396.7338938490028 |
26 | 1237648704045908141 | 132.38698287749284 |
27 | 1237648704046563336 | 54.66301228490028 |
28 | 1237648704054231257 | 222.06848740740742 |
29 | 1237648704055017848 | 149.46917421652418 |
30 | 1237648704057573661 | 184.48766646153842 |
⋮ | ⋮ | ⋮ |
size(submeta)
(8447,2)
sub_ids_df = DataFrame(SDSS_ID=sub_ids)
submeta2 = join(submeta, sub_ids_df, on=:SDSS_ID, kind=:inner)
SDSS_ID | Distance | |
---|---|---|
1 | 1237645943979114582 | 169.9678038233618 |
2 | 1237648672922468973 | 198.58047431623933 |
3 | 1237648702986125622 | 518.0174523561253 |
4 | 1237648703505432738 | 102.49314803418802 |
5 | 1237648703508119586 | 98.64965498290596 |
6 | 1237648703509758235 | 366.41300422222224 |
7 | 1237648703516508315 | 510.33046625356116 |
8 | 1237648704045908141 | 132.38698287749284 |
9 | 1237648704046563336 | 54.66301228490028 |
10 | 1237648704057966842 | 357.01779898575495 |
11 | 1237648704061505776 | 272.46095185754984 |
12 | 1237648704596279527 | 230.60958307692312 |
13 | 1237648704600736312 | 354.882525068376 |
14 | 1237648704602702523 | 209.2568439031339 |
15 | 1237648704602767946 | 210.11095347008546 |
16 | 1237648705122074764 | 155.8749959686609 |
17 | 1237648705135641098 | 308.3335536695156 |
18 | 1237648705137607019 | 142.6362976809117 |
19 | 1237648705656783006 | 179.36300905982904 |
20 | 1237648720157540452 | 349.75786766666664 |
21 | 1237648720163045463 | 368.1212233561253 |
22 | 1237648720163111061 | 204.98629606837605 |
23 | 1237648720164225051 | 96.94143584900284 |
24 | 1237648720165470336 | 317.72875890598283 |
25 | 1237648720175235225 | 177.6547899259259 |
26 | 1237648720691134685 | 503.4975897179487 |
27 | 1237648720712237155 | 146.9068455156695 |
28 | 1237648720713679086 | 313.45821107122504 |
29 | 1237648720718201265 | 166.55136555555555 |
30 | 1237648721219158109 | 386.05752426210813 |
⋮ | ⋮ | ⋮ |
issorted(submeta2[:SDSS_ID])
true
size(submeta2)
(1734,2)
all(submeta2[:SDSS_ID] .== sub_ids)
true
# Feature matrix for the validation IDs (distance first, then the image
# features), followed by the prediction and the submission table.
subfeats = zeros(eltype(first(subresmap)), length(subresmap), length(first(subresmap)) + 1)
subfeats[:, 1] = submeta2[:Distance]
for (row, fvec) in enumerate(subresmap)
    subfeats[row, 2:end] = fvec
end
subpred = XGBoost.predict(boost, subfeats)
submission = DataFrame(pssid=sub_ids,mass=subpred)
pssid | mass | |
---|---|---|
1 | 1237645943979114582 | 10.253867 |
2 | 1237648672922468973 | 10.499608 |
3 | 1237648702986125622 | 11.209326 |
4 | 1237648703505432738 | 9.918384 |
5 | 1237648703508119586 | 9.894254 |
6 | 1237648703509758235 | 10.147396 |
7 | 1237648703516508315 | 11.507912 |
8 | 1237648704045908141 | 10.090234 |
9 | 1237648704046563336 | 10.331856 |
10 | 1237648704057966842 | 11.38165 |
11 | 1237648704061505776 | 10.500261 |
12 | 1237648704596279527 | 11.028665 |
13 | 1237648704600736312 | 10.517279 |
14 | 1237648704602702523 | 9.920931 |
15 | 1237648704602767946 | 9.871428 |
16 | 1237648705122074764 | 10.525749 |
17 | 1237648705135641098 | 10.855779 |
18 | 1237648705137607019 | 9.664072 |
19 | 1237648705656783006 | 10.777236 |
20 | 1237648720157540452 | 11.080576 |
21 | 1237648720163045463 | 11.019354 |
22 | 1237648720163111061 | 10.698677 |
23 | 1237648720164225051 | 9.682376 |
24 | 1237648720165470336 | 11.08739 |
25 | 1237648720175235225 | 10.604492 |
26 | 1237648720691134685 | 11.087707 |
27 | 1237648720712237155 | 10.511227 |
28 | 1237648720713679086 | 10.898017 |
29 | 1237648720718201265 | 10.513243 |
30 | 1237648721219158109 | 11.407637 |
⋮ | ⋮ | ⋮ |
pwd()
"/Users/ken/Coding/astrohack/Test"
writetable("/Users/ken/Coding/astrohack/AstroWhack_20170510_0515", submission, separator=',')
finalfiles = [joinpath(sub_dir, testfile) for testfile in testfiles];
# i=8447
# @show string(submeta[:SDSS_ID][i])
# string(submeta[:SDSS_ID][i]) == testfiles[i][1:end-6]
string((submeta[:SDSS_ID])[i]) = "1237680241434689718"
true
@time finalresmap = pmap(process, finalfiles)
36.799306 seconds (1.17 M allocations: 46.200 MB, 0.09% gc time)
8447-element Array{Any,1}: Float32[-0.0238069,-0.0166045,122.753,-13.807,0.89608,1065.0,3.29731,3.62816,4.40761,10.8722,4.31777,2.26844,0.833554,0.275684,0.15025,0.0581227,0.0410931,0.0295711,0.0201667] Float32[-0.0221257,-0.0153729,199.564,0.0247038,50.3156,716.0,19.3968,5.25251,3.91166,376.235,3.7299,2.92906,0.109497,0.0354269,0.0155805,0.0129812,0.00972552,0.00561858,0.00497146] Float32[-0.0259305,-0.0183745,294.038,-36.3482,11.5167,2599.0,10.7808,3.27875,3.46207,116.226,6.797,2.36486,1.15873,0.472192,0.194996,0.15448,0.128698,0.0785831,0.0507989] Float32[-0.0401618,-0.0276491,280.714,-8.86661,3.84388,1667.0,8.22684,4.83535,4.30855,67.6808,3.78111,2.04784,0.389971,0.262359,0.177349,0.0999863,0.0742735,0.0623478,0.0538916] Float32[-0.0288486,-0.0188661,347.739,-16.9966,9.24423,1760.0,15.1196,4.04307,4.04472,228.603,5.73363,0.528576,0.162444,0.0975928,0.0851719,0.0542921,0.0489128,0.0413476,0.0390132] Float32[-0.03132,-0.0207546,209.749,-13.0841,1.60218,1399.0,6.886,5.31207,5.27506,47.417,1.26693,0.915717,0.120925,0.0759941,0.0634458,0.0538706,0.0449911,0.0388538,0.0328809] Float32[-0.0297873,-0.0197511,118.868,-10.9486,2.78487,780.0,5.13959,3.31948,3.17728,26.4153,4.3197,2.83596,0.987817,0.777083,0.221771,0.172872,0.0640944,0.0425459,0.0222147] Float32[-0.0343748,-0.0243915,271.26,-9.02401,1.77197,1555.0,8.91335,5.59961,5.47906,79.4477,0.981799,0.679148,0.148481,0.128338,0.0818264,0.0600856,0.0531014,0.0449605,0.0384258] Float32[-0.0393923,-0.0285296,61.7705,-1.9654,2.11399,380.0,5.18219,4.97469,4.36143,26.8551,0.4136,0.268716,0.0313034,0.0176143,0.0144345,0.0106531,0.00594507,0.00478446,0.00460341] Float32[-0.0306547,-0.0207662,2263.16,-13.4224,20857.4,2627.0,283.967,1.52827,1.44768,80637.5,280.328,34.0924,22.7342,9.89975,3.47196,1.35418,0.928527,0.681344,0.158414] Float32[-0.0264197,-0.0191202,104.684,-4.71069,0.979912,611.0,4.50021,3.69801,5.10775,20.2519,5.47439,1.88101,0.749192,0.302666,0.0723942,0.0480454,0.0266352,0.0142695,0.0118341] 
Float32[-0.0337478,-0.0227724,35.5786,-1.79747,0.462266,511.0,2.47139,4.8446,5.84067,6.10775,0.580204,0.146385,0.0233278,0.0171749,0.00945096,0.00610758,0.00423711,0.00365466,0.00331078] Float32[-0.0388794,-0.0255626,141.739,-4.50299,14.8038,723.0,10.9707,4.26369,3.8324,120.356,2.54142,0.298381,0.115318,0.049027,0.039763,0.0343339,0.0301527,0.023905,0.0171862] ⋮ Float32[-0.0388188,-0.0269986,100.075,-5.60099,0.574883,921.0,3.70688,5.83307,6.96142,13.7409,0.592127,0.0598994,0.047491,0.0447444,0.0357101,0.0306349,0.0226849,0.021515,0.0192207] Float32[-0.036615,-0.0252135,85.7534,-11.3435,0.987985,660.0,3.98066,3.93369,5.49533,15.8456,1.58057,0.762529,0.201341,0.128207,0.060366,0.0390301,0.0215266,0.0162954,0.0159873] Float32[-0.0391021,-0.0269316,59.0693,-0.921251,0.269923,888.0,3.35456,6.46051,7.05811,11.2531,0.0430987,0.0275277,0.0272315,0.0163634,0.013477,0.0100181,0.00900955,0.00783415,0.00689445] Float32[-0.0284323,-0.0201301,188.049,-7.83661,1.15097,1156.0,6.50175,4.90257,5.92607,42.2727,3.57348,0.809907,0.159669,0.108959,0.0543403,0.0305299,0.0266505,0.0246743,0.0205679] Float32[-0.026701,-0.0171944,47.2168,-3.99243,1.37963,600.0,3.4145,4.91028,5.47022,11.6588,1.04615,0.0787401,0.0281717,0.0146486,0.0115688,0.0106767,0.00829753,0.00550063,0.00392126] Float32[-0.0436145,-0.0302079,70.2839,-2.24941,3.35821,543.0,5.5219,4.75692,4.75203,30.4914,0.691442,0.21125,0.0741185,0.0359066,0.0270398,0.0255724,0.0229603,0.0133547,0.0102568] Float32[-0.0311165,-0.0218958,95.7188,-7.26275,2.13064,605.0,4.98077,3.13025,4.26754,24.8081,2.43914,1.362,0.528579,0.408507,0.175709,0.130145,0.0563934,0.0301694,0.0177976] Float32[-0.0306875,-0.0213437,102.816,-7.09635,2.47202,784.0,5.47459,4.15071,3.69459,29.9711,1.41146,0.987861,0.238531,0.154596,0.0356693,0.0292889,0.0233591,0.0208892,0.0165469] Float32[-0.0289612,-0.0196617,225.09,-11.4098,2.33924,1111.0,10.5869,4.76706,5.24389,112.083,1.11906,0.376079,0.109124,0.0578122,0.0349365,0.0249957,0.0226111,0.0210753,0.017403] 
Float32[-0.0268191,-0.0175035,223.42,-13.9846,1.20453,1393.0,7.33851,5.21909,5.89935,53.8537,2.37723,0.640218,0.137647,0.0904851,0.0603988,0.0484254,0.0301716,0.0283286,0.0254232] Float32[-0.0295125,-0.0211301,132.101,-7.40629,1.478,739.0,6.49851,4.5906,5.12735,42.2307,3.05652,0.689978,0.232232,0.0753837,0.0302024,0.0206979,0.0188766,0.0151489,0.0139682] Float32[-0.032073,-0.0208545,78.06,-1.45072,1.17794,807.0,5.30111,5.80034,5.53513,28.1018,0.394634,0.12959,0.0127868,0.0108429,0.0082652,0.0077856,0.00492713,0.0044037,0.00285423]
# Feature matrix for every test image (distance first, then the image
# features), followed by the prediction and the final result table.
res = finalresmap
finalfeats = zeros(eltype(first(res)), length(res), length(first(res)) + 1)
finalfeats[:, 1] = submeta[:Distance]
for (row, fvec) in enumerate(res)
    finalfeats[row, 2:end] = fvec
end
finalpred = XGBoost.predict(boost, finalfeats)
finaldf = DataFrame(pssid=submeta[:SDSS_ID],mass=finalpred)
pssid | mass | |
---|---|---|
1 | 1237645943978590386 | 10.0421915 |
2 | 1237645943979114582 | 10.253867 |
3 | 1237645943979311221 | 10.291842 |
4 | 1237648672922468973 | 10.499608 |
5 | 1237648673992671592 | 11.1321535 |
6 | 1237648674529476993 | 10.320119 |
7 | 1237648674532753564 | 11.115282 |
8 | 1237648675603874096 | 10.217378 |
9 | 1237648702972887077 | 10.962672 |
10 | 1237648702978654353 | 10.688571 |
11 | 1237648702979244208 | 10.409277 |
12 | 1237648702983897351 | 10.241099 |
13 | 1237648702985797942 | 10.920508 |
14 | 1237648702986125622 | 11.209326 |
15 | 1237648702992023572 | 10.736224 |
16 | 1237648703503794279 | 11.2124 |
17 | 1237648703504515190 | 10.264055 |
18 | 1237648703505432738 | 9.918384 |
19 | 1237648703505629237 | 11.054634 |
20 | 1237648703508119586 | 9.894254 |
21 | 1237648703509758235 | 10.147396 |
22 | 1237648703512772638 | 11.356924 |
23 | 1237648703516508315 | 11.507912 |
24 | 1237648703527649617 | 10.6849375 |
25 | 1237648703531123239 | 10.986324 |
26 | 1237648704045908141 | 10.090234 |
27 | 1237648704046563336 | 10.331856 |
28 | 1237648704054231257 | 10.622571 |
29 | 1237648704055017848 | 9.959886 |
30 | 1237648704057573661 | 11.089472 |
⋮ | ⋮ | ⋮ |
writetable("/Users/ken/Coding/astrohack/AstroWhack_final_predictions_20170501", finaldf, separator=',')