Astrohack 2017

Quick & Dirty Solution

Team AstroWhack

  • Ken Bastiaensen
  • (Benjamin Vandermarliere)
  • ((Jannes Nys))

Quick & Dirty

Quick → Julia on my laptop

Dirty → Simplest plan of attack

  • Minimal preprocess to select galaxy
  • Get some features
  • Run XGBoost
  • ...
  • Profits!

Preprocess

  • smooth image and threshold to create binary image
  • use Images.label_components to label connected pixels (blobs)
  • central blob = galaxy
  • crop central galaxy by using symmetry: minimum of deviations around center for each axis
  • set non-label components in crop to zero

Some features:

  • Distance
  • Gert: intensity of light
    • some sums
  • Benjamin: symmetry
    • eigenvalues
  • prof Maarten Baes: lighter galaxies are more bumpy
    • entropy
  • Some guy on internet: different noise levels per image
    • some low percentiles of full image

I want to thank:

  • Gert for great organisation!
  • Julia for fast and beautiful language!
  • XGBoost for amazing regressions!
  • Benjamin and everyone else for fun vibe!
In [1]:
using Images, ImageDraw
In [2]:
using Interact, BenchmarkTools

Load a file and preprocess a bit

In [3]:
sampledir = "/Users/ken/Coding/astrohack/Astrohack/Sample_Data/SAMPLE/"

files = readdir(sampledir)

ids = String[first(split(f, "-")) for f in files if f[end-4]=='g'];
In [4]:
id = ids[27]
@time img_raw = readcsv(joinpath(sampledir, id * "-g.csv"), Float32)
img_gray = colorview(Gray, img_raw)
  1.183841 seconds (2.03 M allocations: 70.628 MB, 1.49% gc time)
Out[4]:
In [5]:
img_peaks = copy(img_raw)
cutoff = min(1, img_raw[fld.(size(img_raw), 2)...])
img_peaks[img_peaks .> 1] = 0
img = colorview(RGB, img_peaks,img_peaks,img_raw)
Out[5]:
In [6]:
summary(img)
Out[6]:
"151×151 ColorView{RGB}(::ImageCore.StackedView{Float32,3,Tuple{Array{Float32,2},Array{Float32,2},Array{Float32,2}}}) with element type ColorTypes.RGB{Float32}"
In [7]:
img_bin = falses(img_raw)
img_smooth = imfilter(img_raw, Kernel.gaussian(2))
bin_th = 0.05
img_bin[img_smooth .> bin_th] = true
colorview(Gray, img_bin)
Out[7]:
In [8]:
labs = label_components(img_bin);

centre = fld.(size(labs), 2)
labs_blob = zeros(img_raw)
labs_ind = labs .== labs[centre...]
labs_blob[labs_ind] = 0.5
Out[8]:
0.5
In [9]:
img_labs = colorview(RGB, labs_blob,img_peaks,copy(img_raw))
Out[9]:
In [10]:
# find box around center that contains the galaxy based on the center label. 
#It's robust by using the galaxy symmetry so we take minimum of opposite directions
@everywhere function findbox(inds)
    ci, cj = fld.(size(inds), 2)
    north = east = south = west = 0
    for j = 1:size(inds, 2)
        for i = 1:size(inds, 1)
            if inds[i,j]
                if i < ci #north
                    ci - i > north && (north = ci - i)
                else #south
                    i - ci > south && (south = i - ci)
                end
                if j < cj
                    cj - j > west  && (west  = cj - j)
                else
                    j - cj > east  && (east  = j - cj)
                end
            end
        end
    end
    min_width  = min(east, west) 
    min_heigth = min(north, south)
    return min_width, min_heigth
end
findbox(labs_ind)
Out[10]:
(24,24)
In [16]:
@benchmark findbox($labs_ind) #it's fast, thanks Julia!
Out[16]:
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     291.487 μs (0.00% GC)
  median time:      321.457 μs (0.00% GC)
  mean time:        339.062 μs (0.00% GC)
  maximum time:     1.559 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
In [11]:
# draw the box on the image
function drawcbox!(img, w, h) #width, height
    ci, cj = fld.(size(img), 2)
    T = eltype(img)
    # left ,right
    for i = ci-h:ci+h
        img[i, cj-w] = one(T)
        img[i, cj+w] = one(T)
    end
    # up,down
    for j = cj-w:cj+w
        img[ci+h, j] = one(T)
        img[ci-h, j] = one(T)
    end
    img
end
Out[11]:
drawcbox! (generic function with 1 method)
In [12]:
drawcbox!(img_labs, findbox(labs_ind)...)
Out[12]:

interactively look at preprocessing result of sample images

In [18]:
function idplot(i, binth=0.05)
    sampledir = "/Users/ken/Coding/astrohack/Astrohack/Sample_Data/SAMPLE/"
    img_raw = readcsv(joinpath(sampledir, ids[i]*"-g.csv"))#, Float32)
    ci, cj = fld.(size(img_raw), 2)
    
    img_peaks = copy(img_raw)
    cutoff = min(1, img_raw[ci, cj])
    img_peaks[img_peaks .> cutoff] = 0
    
    img_bin = falses(img_raw)
    img_smooth = imfilter(img_raw, Kernel.gaussian(2))
    img_bin[img_smooth .> binth] = true
    labels = label_components(img_bin)
    img_cblob = zeros(img_raw)
    labs_ind = labels .== labels[ci, cj]
    img_cblob[labs_ind] = 0.5
    
    img_labs = colorview(RGB, img_cblob, img_peaks, img_raw)
    
    #img_circles = copy(img_labs)
    blobs = blob_LoG(img_raw, 1:4) #TODO issue for float32?
    blobth = 0.1
    bigblobs = [blob for blob in blobs if blob.amplitude > blobth]
    for blob in bigblobs
        circle = CirclePointRadius(blob.location, sqrt(2)*blob.σ)
        draw!(img_labs, circle, RGB(one(eltype(img_raw)),0,0))
    end
                
    drawcbox!(img_labs, findbox(labs_ind)...)
end
WARNING: Method definition idplot(Any) in module Main at In[17]:2 overwritten at In[18]:2.
WARNING: Method definition idplot(Any, Any) in module Main at In[17]:2 overwritten at In[18]:2.
Out[18]:
idplot (generic function with 2 methods)
In [20]:
@manipulate for i = 1:length(ids), 
    binth = 0.01:0.005:0.08
        idplot(i, binth)
end
Out[20]:

quick and dirty approach

(= only entropy and symmetrie)

Preprocess

  • smooth image and threshold to create binary image and use label_components to find central galaxy.
  • crop central galaxy by using symmetry: minimum of deviations around center for each axis
  • set non-label components in crop to zero ## Features
  • distances (given)
  • lower percentiles for full image for noise level (eg lower 1 and 5 percentile)
  • entropy of crop,
  • add sum of values and number of galaxy (nonzero) pixels
  • some symmetry measures from eigenvalues

Then run good ol' XGBoost on features

further ideas

  • entropy after fitting and removing (two consecutive?) gaussian fits
  • use imROF for denoising
  • cut off peaks higher than center value?
  • remove remaining galaxies/stars in the crop instead of crude non-labels to zero by fitting gaussian
In [184]:
ci, cj = fld.(size(img_raw), 2)
BOXMIN = 10
cwidth, cheight = max(findbox(labs_ind), (BOXMIN,BOXMIN))

img_crop = img_raw[ci-cheight:ci+cheight, cj-cwidth:cj+cwidth]
colorview(Gray, img_crop)
Out[184]:
In [32]:
img_onlycblob = copy(img_raw)
img_onlycblob[!labs_ind] = 0
boxsize = max(cwidth, cheight)
img_box = img_raw[ci-boxsize:ci+boxsize, cj-boxsize:cj+boxsize]
colorview(Gray, img_box)
Out[32]:
In [37]:
@time reseigs = eigs(img_box, nev=10)
norm.(reseigs[1]).^2
  0.002158 seconds (738 allocations: 61.313 KB)
Out[37]:
10-element Array{Float32,1}:
 645.704    
  17.6002   
   0.942475 
   0.887299 
   0.0973208
   0.093082 
   0.0130057
   0.0130057
   0.0113171
   0.0113171
In [36]:
@time reseigs = eigs(img_crop'*img_crop, nev=10)
norm.(reseigs[1])
  0.001327 seconds (286 allocations: 31.688 KB)
Out[36]:
10-element Array{Float32,1}:
 647.59     
  17.7356   
   1.0503   
   0.901244 
   0.144213 
   0.129747 
   0.0570758
   0.0509714
   0.0372561
   0.0361984
In [165]:
entropy(img_crop)
Out[165]:
4.054558771384226
In [183]:
noise = img_crop - imfilter(img_crop, Kernel.gaussian(1))

maximum(noise)
Out[183]:
0.5881191265860775
In [175]:
entropy(img_crop - imfilter(img_crop, Kernel.gaussian(2)))
Out[175]:
2.7846815517236028

Process function

In [5]:
using StatsBase
In [1]:
addprocs();
In [16]:
@everywhere using Images
In [6]:
traindir = "/Users/ken/Coding/astrohack/Train/"
trainlist = readdir(traindir);
In [41]:
# remove i-pics once
# i_files = filter(x->x[end-4]=='i', trainlist)
# cd(traindir)
# for file in i_files
#     rm(file)
# end
In [49]:
using DataFrames
In [196]:
trainmeta = readtable("/Users/ken/Coding/astrohack/Train.csv",separator=';')
sort!(trainmeta) #sort on ID to match directory listing!
Out[196]:
SDSS_IDlogMstarerr_logMstarDistance
1123764587957846025511.1009999999999990.028999999999999998518.0174523561253
2123764594182435644310.5390000000000010.032172.10307774074076
3123764594397439613411.3180.015654.2479282849002
4123764594397583772210.8620.026000000000000002285.27259536182333
512376459439783282899.4850.042165.69725598860398
6123764594397839369410.7559999999999980.028999999999999998173.81129687464386
7123764594397898357910.1680.024207.54862476923074
8123764594397898368311.7920.017794.3218972649573
9123764594397911462210.9320.044000000000000004168.25958468945865
10123764594397950795410.820.012158.43732466951565
11123764594397963894510.090.04163.56198207122506
12123764679652671548210.4550.033222.49554219088319
13123764679760052276411.2380.013999999999999999390.32807209686604
1412376486729218139859.5850.057999999999999996187.90410472934474
15123764867292233795510.740.031196.0181456153846
16123764867345868435510.6779999999999990.028999999999999998215.66266565527064
17123764867345888108110.3150.032253.67054138461535
18123764867345907716911.090.011000000000000001252.38937703418802
19123764867345927434411.0279999999999990.022000000000000002400.5773869002848
20123764867345933933111.2050.027999999999999997399.72327733333333
2112376486734596670029.9340.043190.03937864672363
22123764867345966723410.9740.024197.29930996581194
23123764867346058525510.9570.016299.79245799999995
24123764867397143762310.2270.01553.38184793447294
25123764867399516209310.4490000000000020.032131.5328733105413
26123764867399568654911.0240000000000010.019338.22738851282054
27123764867399594810710.2990000000000010.045116.58595588888888
28123764867399699757410.6030.048252.8164318176638
29123764867399712772410.6380.026000000000000002187.90410472934474
30123764867451099559411.5220.013999999999999999289.5431431965812
In [51]:
size(trainmeta,1) == length(trainlist)
Out[51]:
true
In [191]:
@everywhere function process(file)
    
    BINTH  = 0.05 # chosen by eye
    BOXMIN = 10   # minimum crop of 100 pixels
    T = Float32   # typical premature optimization
    
    img_raw = readcsv(file, T) 
    ci, cj = fld.(size(img_raw), 2)
    
    # get percentiles for noise levels
    img_vec = reshape(img_raw, length(img_raw))
    p1 = StatsBase.percentile(img_vec, 1)
    p2 = StatsBase.percentile(img_vec, 5)
    
    # select the galaxy by thresholding smoothed image and labeling blobs
    img_smooth = imfilter(T, img_raw, Kernel.gaussian(2))
    img_bin = img_smooth .> BINTH
    labels = label_components(img_bin)
    labs_ind = labels .== labels[ci, cj]
    
    # crop galaxy
    cwidth, cheight = max(findbox(labs_ind), (BOXMIN,BOXMIN))
    crop_ind = ci-cheight:ci+cheight, cj-cwidth:cj+cwidth
    img_crop = img_raw[crop_ind...]

    #Crudely set non galaxy values in crop to zero to remove possible other galaxy
    labels_ind_crop = labs_ind[crop_ind...]
    img_crop[!labels_ind_crop] = 0
    
    # get top eigenvalues (spectra) for symmetry features
    reseigs = eigs(img_crop'*img_crop, nev=10)
    spect = norm.(reseigs[1])
    
    # get sum of intensities and (squared) noise
    s1 = sum(img_crop)
    img_crop_smooth = view(img_smooth, crop_ind...)
    s2 = sum(img_crop - img_crop_smooth)
    s3 = sum((img_crop - img_crop_smooth).^2)
    s4 = sum(labs_ind) #number of galaxy pixels 
    s5 = norm(img_crop) #why not..
    
    # get entropy 
    e1 = Images.entropy(img_crop)
    e2 = Images.entropy(img_crop - imfilter(T, img_crop, Kernel.gaussian(2)))
    
    return T[p1,p2, s1, s2, s3, s4, s5, e1, e2, spect...]
end
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3.
WARNING: Method definition process(Any) in module Main at In[189]:3 overwritten at In[191]:3WARNING: Method definition process.
(Any) in module Main at In[189]:3 overwritten at In[191]:3.
In [193]:
# test the function
@time process(joinpath(traindir, trainlist[10]))
  0.011020 seconds (35.67 k allocations: 2.102 MB)
Out[193]:
19-element Array{Float32,1}:
   -0.019007 
   -0.0117548
  309.332    
  -11.8464   
   39.1035   
 1283.0      
   19.6011   
    4.088    
    3.27034  
  384.204    
    8.92826  
    0.288739 
    0.174709 
    0.0511862
    0.0380456
    0.0332355
    0.0284405
    0.0271679
    0.0230588
In [217]:
# test on other worker thread
@time remotecall_fetch(process, 2, joinpath(traindir, trainlist[10]))
  0.009558 seconds (162 allocations: 8.563 KB)
Out[217]:
19-element Array{Float32,1}:
   -0.019007 
   -0.0117548
  309.332    
  -11.8464   
   39.1035   
 1283.0      
   19.6011   
    4.088    
    3.27034  
  384.204    
    8.92826  
    0.288739 
    0.174709 
    0.0511882
    0.0380461
    0.0332357
    0.0284385
    0.0271685
    0.0230591
In [68]:
# @time begin
#     N = 100
#     fs = zeros(N, 15)
#     fs[:, 1] = trainmeta[:Distance][1:N]
#     @sync @parallel for i=1:N
#         file = joinpath(traindir, trainlist[i])
#         fs[i, 2:end] = process(file)
#     end
# end
In [197]:
# process all train images in parallel and extract features
# longest step, about 6 minutes on my 2015 macbook pro
@time resmap = pmap(process, String[joinpath(traindir, f) for f in trainlist]);
387.255984 seconds (11.21 M allocations: 448.294 MB, 0.18% gc time)
In [278]:
feats = zeros(eltype(resmap[1]), length(resmap), 1+length(resmap[1]))
feats[:, 1] = trainmeta[:Distance]
for i = 1:length(resmap)
    feats[i, 2:end] = resmap[i]
end
In [201]:
values = convert(Array{Float32}, trainmeta[:logMstar]);

Regression

XGBoost has a known compile issue on mac because clang does not come with multithreaded OpenMP installed. So I installed gcc with homebrew brew install gcc --without-multilib, but you still have to link gcc-6 and g++-6 locally! I installed Pkg.add("XGBoost") and in the deps folder I copied the config.mk to main xgboost folder and changed the gcc and g++ to local /usr/local/bin/gcc-6 paths.

In [218]:
using XGBoost
In [219]:
sum(values .== -99) #missing values!
Out[219]:
1374
In [204]:
# with removing missing values (god damnit)
nrows = length(resmap) - sum(values.==-99)
T = eltype(resmap[1])
feats2 = zeros(T, nrows, size(feats, 2))
values2 = zeros(T, nrows)
err2 = zeros(T, nrows)
currenti = 1
for i = 1:length(resmap)
    if values[i] != -99
        feats2[currenti, 1] = trainmeta[:Distance][i]
        feats2[currenti, 2:end] = resmap[i]
        values2[currenti] = values[i]
        err2[currenti] = trainmeta[:err_logMstar][i]
        currenti += 1
    end
end
size(feats2)
Out[204]:
(74891,20)
In [205]:
using Plots
gr()
Out[205]:
Plots.GRBackend()
In [206]:
histogram(values2)
Out[206]:
8 9 10 11 12 0 2500 5000 7500 y1
In [208]:
rand_ids = shuffle(1:length(values2));

train_n = length(values2) - 1734
train_ids = rand_ids[1:train_n]

#train_sample = sample(1:length(feats), train_n, replace=false)
train_x = feats2[train_ids, :]
train_y = values2[train_ids]

test_n  = 1734
test_ids= rand_ids[train_n:train_n+test_n]
test_x  = feats2[test_ids, :]
test_y  = values2[test_ids]
test_err= err2[test_ids];
In [209]:
@time boost = xgboost(train_x, 50, label=train_y, max_depth=50, learning_rate=0.3)
[1]	train-rmse:7.053658
[2]	train-rmse:4.942040
[3]	train-rmse:3.464624
[4]	train-rmse:2.431568
[5]	train-rmse:1.710097
[6]	train-rmse:1.207397
[7]	train-rmse:0.858015
[8]	train-rmse:0.615704
[9]	train-rmse:0.447069
[10]	train-rmse:0.328264
[11]	train-rmse:0.243532
[12]	train-rmse:0.182273
[13]	train-rmse:0.137520
[14]	train-rmse:0.104366
[15]	train-rmse:0.079667
[16]	train-rmse:0.061113
[17]	train-rmse:0.047157
[18]	train-rmse:0.036617
[19]	train-rmse:0.028589
[20]	train-rmse:0.022415
[21]	train-rmse:0.017671
[22]	train-rmse:0.014006
[23]	train-rmse:0.011178
[24]	train-rmse:0.008958
[25]	train-rmse:0.007220
[26]	train-rmse:0.005842
[27]	train-rmse:0.004757
[28]	train-rmse:0.003882
[29]	train-rmse:0.003181
[30]	train-rmse:0.002616
[31]	train-rmse:0.002161
[32]	train-rmse:0.001797
[33]	train-rmse:0.001501
[34]	train-rmse:0.001260
[35]	train-rmse:0.001060
[36]	train-rmse:0.000909
[37]	train-rmse:0.000785
[38]	train-rmse:0.000686
[39]	train-rmse:0.000604
[40]	train-rmse:0.000551
[41]	train-rmse:0.000512
[42]	train-rmse:0.000505
[43]	train-rmse:0.000468
[44]	train-rmse:0.000467
 15.151809 seconds (7.82 k allocations: 6.371 MB)
[45]	train-rmse:0.000464
[46]	train-rmse:0.000464
[47]	train-rmse:0.000464
[48]	train-rmse:0.000464
[49]	train-rmse:0.000464
[50]	train-rmse:0.000464
Out[209]:
XGBoost.Booster(Ptr{Void} @0x00007f9bcb14c100)
In [210]:
test_pred = XGBoost.predict(boost, test_x);
In [211]:
hcat(test_y, test_pred, test_err)
Out[211]:
1735×3 Array{Float32,2}:
 11.182  11.182    0.009
 11.51   11.5777   0.026
 10.907  10.8396   0.037
 10.59   10.5571   0.041
 11.42   11.2049   0.031
 10.256  10.011    0.024
 11.171  10.6016   0.025
  9.349   9.34016  0.055
 10.296  10.2444   0.014
 10.941  10.942    0.01 
 10.71   10.7143   0.042
 11.307  11.2312   0.034
  9.463   9.29443  0.045
  ⋮                     
  9.971  10.0718   0.03 
 10.808  10.8301   0.022
 10.317  10.4416   0.056
  9.23    9.22179  0.056
  9.746   9.84914  0.033
 11.514  11.4574   0.029
 10.371  10.3303   0.021
 10.349  10.3235   0.033
 10.192   9.96918  0.063
 10.101  10.2186   0.028
 10.792  11.0878   0.023
 10.557  10.4991   0.034
In [212]:
sum((test_y - test_pred).^2 ./ (test_err.^2))
Out[212]:
133952.8f0

Submission

In [236]:
sub_ids = readcsv("validationdata_SDSSID.csv", Int, header=false)
# sort!
sub_ids = sort!(reshape(sub_ids, length(sub_ids))) 
sub_dir = "/Users/ken/Coding/astrohack/Test/"
sub_files = [joinpath(sub_dir, string(id) * "-g.csv") for id in sub_ids];
In [273]:
length(sub_ids)
Out[273]:
1734
KERNEL EXCEPTION
BoundsError: attempt to access 13-element Array{UInt8,1} at index [-1]

 in next at ./strings/string.jl:92 [inlined]
 in getindex(::String, ::Int64) at ./strings/basic.jl:70
 in ind2chr(::String, ::Int64) at ./strings/basic.jl:227
 in complete_request(::ZMQ.Socket, ::IJulia.Msg) at /Users/ken/.julia/v0.5/IJulia/src/handlers.jl:40
 in eventloop(::ZMQ.Socket) at /Users/ken/.julia/v0.5/IJulia/src/eventloop.jl:8
 in (::IJulia.##13#19)() at ./task.jl:360
In [251]:
testfiles = readdir(sub_dir);
In [248]:
# remove i-pics once
# i_files = filter(x->x[end-4]=='i', testfiles)

# cd(sub_dir)
# for file in i_files
#     rm(file)
# end
In [254]:
@time subresmap = pmap(process, sub_files);
  7.182995 seconds (234.93 k allocations: 9.468 MB)
In [291]:
submeta = readtable("/Users/ken/Coding/astrohack/Test_Distance.csv",separator=';')
sort!(submeta)
Out[291]:
SDSS_IDDistance
11237645943978590386160.57259858689457
21237645943979114582169.9678038233618
31237645943979311221152.88561248433047
41237648672922468973198.58047431623933
51237648673992671592232.3178022108262
61237648674529476993137.08458549572646
71237648674532753564371.5376616239317
81237648675603874096140.07396898005695
91237648702972887077367.6941685726495
101237648702978654353212.67328217094013
111237648702979244208231.03663786039883
121237648702983897351287.83492406267806
131237648702985797942299.7924579999999
141237648702986125622518.0174523561253
151237648702992023572144.7715715982906
161237648703503794279300.64656756695155
171237648703504515190108.0448602193732
181237648703505432738102.49314803418802
191237648703505629237477.4472479259259
20123764870350811958698.64965498290596
211237648703509758235366.41300422222224
221237648703512772638374.9540998917379
231237648703516508315510.33046625356116
241237648703527649617102.9202028176638
251237648703531123239396.7338938490028
261237648704045908141132.38698287749284
27123764870404656333654.66301228490028
281237648704054231257222.06848740740742
291237648704055017848149.46917421652418
301237648704057573661184.48766646153842
In [261]:
size(submeta)
Out[261]:
(8447,2)
In [271]:
sub_ids_df = DataFrame(SDSS_ID=sub_ids)
submeta2 = join(submeta, sub_ids_df, on=:SDSS_ID, kind=:inner)
Out[271]:
SDSS_IDDistance
11237645943979114582169.9678038233618
21237648672922468973198.58047431623933
31237648702986125622518.0174523561253
41237648703505432738102.49314803418802
5123764870350811958698.64965498290596
61237648703509758235366.41300422222224
71237648703516508315510.33046625356116
81237648704045908141132.38698287749284
9123764870404656333654.66301228490028
101237648704057966842357.01779898575495
111237648704061505776272.46095185754984
121237648704596279527230.60958307692312
131237648704600736312354.882525068376
141237648704602702523209.2568439031339
151237648704602767946210.11095347008546
161237648705122074764155.8749959686609
171237648705135641098308.3335536695156
181237648705137607019142.6362976809117
191237648705656783006179.36300905982904
201237648720157540452349.75786766666664
211237648720163045463368.1212233561253
221237648720163111061204.98629606837605
23123764872016422505196.94143584900284
241237648720165470336317.72875890598283
251237648720175235225177.6547899259259
261237648720691134685503.4975897179487
271237648720712237155146.9068455156695
281237648720713679086313.45821107122504
291237648720718201265166.55136555555555
301237648721219158109386.05752426210813
In [281]:
issorted(submeta2[:SDSS_ID])
Out[281]:
true
In [272]:
size(submeta2)
Out[272]:
(1734,2)
In [275]:
all(submeta2[:SDSS_ID] .== sub_ids)
Out[275]:
true
In [277]:
subfeats = zeros(eltype(subresmap[1]), length(subresmap), 1+length(subresmap[1]))
subfeats[:, 1] = submeta2[:Distance]
for i = 1:length(subresmap)
    subfeats[i, 2:end] = subresmap[i]
end
In [282]:
subpred = XGBoost.predict(boost, subfeats)
submission = DataFrame(pssid=sub_ids,mass=subpred)
Out[282]:
pssidmass
1123764594397911458210.253867
2123764867292246897310.499608
3123764870298612562211.209326
412376487035054327389.918384
512376487035081195869.894254
6123764870350975823510.147396
7123764870351650831511.507912
8123764870404590814110.090234
9123764870404656333610.331856
10123764870405796684211.38165
11123764870406150577610.500261
12123764870459627952711.028665
13123764870460073631210.517279
1412376487046027025239.920931
1512376487046027679469.871428
16123764870512207476410.525749
17123764870513564109810.855779
1812376487051376070199.664072
19123764870565678300610.777236
20123764872015754045211.080576
21123764872016304546311.019354
22123764872016311106110.698677
2312376487201642250519.682376
24123764872016547033611.08739
25123764872017523522510.604492
26123764872069113468511.087707
27123764872071223715510.511227
28123764872071367908610.898017
29123764872071820126510.513243
30123764872121915810911.407637
In [283]:
pwd()
Out[283]:
"/Users/ken/Coding/astrohack/Test"
In [285]:
writetable("/Users/ken/Coding/astrohack/AstroWhack_20170510_0515", submission, separator=',')

Final submission on full test set

In [ ]:
finalfiles = [joinpath(sub_dir, testfile) for testfile in testfiles];
In [302]:
# i=8447
# @show string(submeta[:SDSS_ID][i])
# string(submeta[:SDSS_ID][i]) == testfiles[i][1:end-6]
string((submeta[:SDSS_ID])[i]) = "1237680241434689718"
Out[302]:
true
In [290]:
@time finalresmap = pmap(process, finalfiles)
 36.799306 seconds (1.17 M allocations: 46.200 MB, 0.09% gc time)
Out[290]:
8447-element Array{Any,1}:
 Float32[-0.0238069,-0.0166045,122.753,-13.807,0.89608,1065.0,3.29731,3.62816,4.40761,10.8722,4.31777,2.26844,0.833554,0.275684,0.15025,0.0581227,0.0410931,0.0295711,0.0201667]            
 Float32[-0.0221257,-0.0153729,199.564,0.0247038,50.3156,716.0,19.3968,5.25251,3.91166,376.235,3.7299,2.92906,0.109497,0.0354269,0.0155805,0.0129812,0.00972552,0.00561858,0.00497146]      
 Float32[-0.0259305,-0.0183745,294.038,-36.3482,11.5167,2599.0,10.7808,3.27875,3.46207,116.226,6.797,2.36486,1.15873,0.472192,0.194996,0.15448,0.128698,0.0785831,0.0507989]                
 Float32[-0.0401618,-0.0276491,280.714,-8.86661,3.84388,1667.0,8.22684,4.83535,4.30855,67.6808,3.78111,2.04784,0.389971,0.262359,0.177349,0.0999863,0.0742735,0.0623478,0.0538916]          
 Float32[-0.0288486,-0.0188661,347.739,-16.9966,9.24423,1760.0,15.1196,4.04307,4.04472,228.603,5.73363,0.528576,0.162444,0.0975928,0.0851719,0.0542921,0.0489128,0.0413476,0.0390132]       
 Float32[-0.03132,-0.0207546,209.749,-13.0841,1.60218,1399.0,6.886,5.31207,5.27506,47.417,1.26693,0.915717,0.120925,0.0759941,0.0634458,0.0538706,0.0449911,0.0388538,0.0328809]            
 Float32[-0.0297873,-0.0197511,118.868,-10.9486,2.78487,780.0,5.13959,3.31948,3.17728,26.4153,4.3197,2.83596,0.987817,0.777083,0.221771,0.172872,0.0640944,0.0425459,0.0222147]             
 Float32[-0.0343748,-0.0243915,271.26,-9.02401,1.77197,1555.0,8.91335,5.59961,5.47906,79.4477,0.981799,0.679148,0.148481,0.128338,0.0818264,0.0600856,0.0531014,0.0449605,0.0384258]        
 Float32[-0.0393923,-0.0285296,61.7705,-1.9654,2.11399,380.0,5.18219,4.97469,4.36143,26.8551,0.4136,0.268716,0.0313034,0.0176143,0.0144345,0.0106531,0.00594507,0.00478446,0.00460341]      
 Float32[-0.0306547,-0.0207662,2263.16,-13.4224,20857.4,2627.0,283.967,1.52827,1.44768,80637.5,280.328,34.0924,22.7342,9.89975,3.47196,1.35418,0.928527,0.681344,0.158414]                  
 Float32[-0.0264197,-0.0191202,104.684,-4.71069,0.979912,611.0,4.50021,3.69801,5.10775,20.2519,5.47439,1.88101,0.749192,0.302666,0.0723942,0.0480454,0.0266352,0.0142695,0.0118341]         
 Float32[-0.0337478,-0.0227724,35.5786,-1.79747,0.462266,511.0,2.47139,4.8446,5.84067,6.10775,0.580204,0.146385,0.0233278,0.0171749,0.00945096,0.00610758,0.00423711,0.00365466,0.00331078] 
 Float32[-0.0388794,-0.0255626,141.739,-4.50299,14.8038,723.0,10.9707,4.26369,3.8324,120.356,2.54142,0.298381,0.115318,0.049027,0.039763,0.0343339,0.0301527,0.023905,0.0171862]            
 ⋮                                                                                                                                                                                          
 Float32[-0.0388188,-0.0269986,100.075,-5.60099,0.574883,921.0,3.70688,5.83307,6.96142,13.7409,0.592127,0.0598994,0.047491,0.0447444,0.0357101,0.0306349,0.0226849,0.021515,0.0192207]      
 Float32[-0.036615,-0.0252135,85.7534,-11.3435,0.987985,660.0,3.98066,3.93369,5.49533,15.8456,1.58057,0.762529,0.201341,0.128207,0.060366,0.0390301,0.0215266,0.0162954,0.0159873]          
 Float32[-0.0391021,-0.0269316,59.0693,-0.921251,0.269923,888.0,3.35456,6.46051,7.05811,11.2531,0.0430987,0.0275277,0.0272315,0.0163634,0.013477,0.0100181,0.00900955,0.00783415,0.00689445]
 Float32[-0.0284323,-0.0201301,188.049,-7.83661,1.15097,1156.0,6.50175,4.90257,5.92607,42.2727,3.57348,0.809907,0.159669,0.108959,0.0543403,0.0305299,0.0266505,0.0246743,0.0205679]        
 Float32[-0.026701,-0.0171944,47.2168,-3.99243,1.37963,600.0,3.4145,4.91028,5.47022,11.6588,1.04615,0.0787401,0.0281717,0.0146486,0.0115688,0.0106767,0.00829753,0.00550063,0.00392126]     
 Float32[-0.0436145,-0.0302079,70.2839,-2.24941,3.35821,543.0,5.5219,4.75692,4.75203,30.4914,0.691442,0.21125,0.0741185,0.0359066,0.0270398,0.0255724,0.0229603,0.0133547,0.0102568]        
 Float32[-0.0311165,-0.0218958,95.7188,-7.26275,2.13064,605.0,4.98077,3.13025,4.26754,24.8081,2.43914,1.362,0.528579,0.408507,0.175709,0.130145,0.0563934,0.0301694,0.0177976]              
 Float32[-0.0306875,-0.0213437,102.816,-7.09635,2.47202,784.0,5.47459,4.15071,3.69459,29.9711,1.41146,0.987861,0.238531,0.154596,0.0356693,0.0292889,0.0233591,0.0208892,0.0165469]         
 Float32[-0.0289612,-0.0196617,225.09,-11.4098,2.33924,1111.0,10.5869,4.76706,5.24389,112.083,1.11906,0.376079,0.109124,0.0578122,0.0349365,0.0249957,0.0226111,0.0210753,0.017403]         
 Float32[-0.0268191,-0.0175035,223.42,-13.9846,1.20453,1393.0,7.33851,5.21909,5.89935,53.8537,2.37723,0.640218,0.137647,0.0904851,0.0603988,0.0484254,0.0301716,0.0283286,0.0254232]        
 Float32[-0.0295125,-0.0211301,132.101,-7.40629,1.478,739.0,6.49851,4.5906,5.12735,42.2307,3.05652,0.689978,0.232232,0.0753837,0.0302024,0.0206979,0.0188766,0.0151489,0.0139682]           
 Float32[-0.032073,-0.0208545,78.06,-1.45072,1.17794,807.0,5.30111,5.80034,5.53513,28.1018,0.394634,0.12959,0.0127868,0.0108429,0.0082652,0.0077856,0.00492713,0.0044037,0.00285423]        
In [306]:
res = finalresmap
finalfeats = zeros(eltype(res[1]), length(res), 1+length(res[1]))
finalfeats[:, 1] = submeta[:Distance]
for i = 1:length(res)
    finalfeats[i, 2:end] = res[i]
end
In [309]:
finalpred = XGBoost.predict(boost, finalfeats)
finaldf   = DataFrame(pssid=submeta[:SDSS_ID],mass=finalpred)
Out[309]:
pssidmass
1123764594397859038610.0421915
2123764594397911458210.253867
3123764594397931122110.291842
4123764867292246897310.499608
5123764867399267159211.1321535
6123764867452947699310.320119
7123764867453275356411.115282
8123764867560387409610.217378
9123764870297288707710.962672
10123764870297865435310.688571
11123764870297924420810.409277
12123764870298389735110.241099
13123764870298579794210.920508
14123764870298612562211.209326
15123764870299202357210.736224
16123764870350379427911.2124
17123764870350451519010.264055
1812376487035054327389.918384
19123764870350562923711.054634
2012376487035081195869.894254
21123764870350975823510.147396
22123764870351277263811.356924
23123764870351650831511.507912
24123764870352764961710.6849375
25123764870353112323910.986324
26123764870404590814110.090234
27123764870404656333610.331856
28123764870405423125710.622571
2912376487040550178489.959886
30123764870405757366111.089472
In [310]:
writetable("/Users/ken/Coding/astrohack/AstroWhack_final_predictions_20170501", finaldf, separator=',')