// Empirical cumulative distribution function (ECDF) computed on the CPU and on
// the GPU with Alea GPU, followed by plots and timing comparisons.
//
// NOTE(review): this file appears to be a sequence of notebook cells. Several
// top-level names (`x`, `y`, `layout`, `numSamples`, `megaBytes`, `data`) are
// deliberately re-defined by later cells, and bare expressions such as
// `{ Html = s }` and `... |> Util.Table` are cell outputs. Keep the cell order.

#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"

#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly

open System
open Alea
open Alea.CSharp
open Alea.Parallel

// Default GPU used by every GPU computation below.
let gpu = Gpu.Default

// One-line summary of the device and process bitness, shown as cell output.
let s =
    sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
        gpu
        gpu.Device.Cores
        ((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
        (IntPtr.Size * 8)
{ Html = s }

/// Binary search over the index range [0, n - 1] of the accessor `v`.
///
/// The interval [l, u] starts as [0, n - 1] and is halved until `u - 1 <= l`.
/// `u` moves to the midpoint when `x < v m`, otherwise `l` does. The final `l`
/// is returned, so whenever `l` was advanced, `v l <= x` holds. For `n <= 2`
/// the loop body never runs and 0 is returned.
///
/// `v` is an accessor function rather than an array so that the same code can
/// be used from both the CPU loop and the GPU kernel below.
///
/// NOTE(review): the source contained a bare `[]` at this position, which looks
/// like an attribute whose `<...>` content was stripped during extraction. It
/// is restored as `ReflectedDefinition` because this function is called inside
/// the GPU kernel in `ecdfGpu` - confirm against the original notebook.
[<ReflectedDefinition>]
let inline binarySearch (n:int) (v:int -> 'T) (x:'T) =
    let mutable l = 0
    let mutable u = n - 1
    while u - 1 > l do
        // The midpoint is computed on unsigned values (presumably to avoid
        // signed overflow of l + u), then converted back to int.
        let m = int((uint32(l) + uint32(u)) >>> 1)
        if x < (v m) then u <- m
        else l <- m
    l

/// Computes ECDF plot points on the CPU.
///
/// `gen` fills a freshly allocated array of `numSamples` values. Only the
/// sort and the plot-point evaluation are timed; generation is not.
/// Returns `(plotX, plotY, elapsed)`, where `plotX.[i]` is the i-th of
/// `numPlotPoints` evenly spaced positions starting at the smallest sample and
/// `plotY.[i]` is the binary-search index at that position divided by
/// `numSamples`.
let ecdfCpu numSamples numPlotPoints (gen:float[] -> unit) =
    let numbers = Array.zeroCreate numSamples
    let plotX = Array.zeroCreate numPlotPoints
    let plotY = Array.zeroCreate numPlotPoints

    gen numbers

    // start measuring time
    GC.Collect()
    GC.WaitForPendingFinalizers()
    let t = System.Diagnostics.Stopwatch.StartNew()

    // first sort it
    Array.sortInPlace numbers

    // grab min max value (the array is sorted, so these are the two ends)
    let min = numbers.[0]
    let max = numbers.[numSamples - 1]
    let dist = max - min

    // binary search to create ecdf
    for i = 0 to numPlotPoints - 1 do
        let x = min + (dist / (float numPlotPoints)) * (float i)
        let v i = numbers.[i]
        let y = binarySearch numSamples v x
        let y = (float y) / (float numSamples)
        plotX.[i] <- x
        plotY.[i] <- y

    t.Stop()

    // to get more accurate timings, we keep these arrays alive so we are not
    // interrupted by GC collection
    GC.KeepAlive(numbers)
    GC.KeepAlive(plotX)
    GC.KeepAlive(plotY)

    plotX, plotY, t.Elapsed

/// Computes ECDF plot points on the GPU; mirrors `ecdfCpu` step by step.
///
/// `gen` receives the session and the session-allocated sample array and is
/// expected to fill it. Only the sort, the min/max kernel and the plot-point
/// kernel are timed; generation is not.
/// Returns `(plotX, plotY, elapsed)` with the plot arrays copied to the host.
let ecdfGpu numSamples numPlotPoints (gen:Session -> float[] -> unit) =
    // create a GPU session, which manages the temporary memory required by the
    // different algorithms.
    use session = new Session(gpu)

    let numbers = session.Allocate(numSamples)
    let minmax = session.Allocate(2)
    let plotX = session.Allocate(numPlotPoints)
    let plotY = session.Allocate(numPlotPoints)

    gen session numbers

    // start measuring time, we synchronize gpu first
    session.Gpu.Synchronize()
    GC.Collect()
    GC.WaitForPendingFinalizers()
    let t = System.Diagnostics.Stopwatch.StartNew()

    // first sort it
    session.Sort(numbers, numbers, (fun a b -> a < b))

    // grab min max value: a single-thread launch copies the two ends of the
    // sorted array into minmax
    session.Gpu.Launch((fun () ->
        minmax.[0] <- numbers.[0]
        minmax.[1] <- numbers.[numSamples - 1]), LaunchParam(1, 1))

    // one iteration per plot point, same arithmetic as the CPU loop
    session.For(0, numPlotPoints, (fun i ->
        let min = minmax.[0]
        let max = minmax.[1]
        let dist = max - min
        let x = min + (dist / (float numPlotPoints)) * (float i)
        let v i = numbers.[i]
        let y = binarySearch numSamples v x
        let y = (float y) / (float numSamples)
        plotX.[i] <- x
        plotY.[i] <- y
        ))

    // synchronize gpu then stop timer (same handle as the synchronize above)
    session.Gpu.Synchronize()
    t.Stop()

    // to get more accurate timings, we keep these arrays alive so we are not
    // interrupted by GC collection
    GC.KeepAlive(numbers)
    GC.KeepAlive(minmax)
    GC.KeepAlive(plotX)
    GC.KeepAlive(plotY)

    Gpu.CopyToHost(plotX), Gpu.CopyToHost(plotY), t.Elapsed

// GPU sample generators: each seeds the session's pseudo random generator with
// 42 and fills `numbers` with the named distribution.

let genGpuUniform (session:Session) (numbers:float[]) =
    session.UsingPseudoRandom(seed=42UL)
    session.RandomUniform(numbers)

let genGpuNormal (session:Session) (numbers:float[]) =
    session.UsingPseudoRandom(seed=42UL)
    session.RandomNormal(numbers, 0.0, 1.0)

let genGpuLogNormal (session:Session) (numbers:float[]) =
    session.UsingPseudoRandom(seed=42UL)
    session.RandomLogNormal(numbers, 0.0, 1.0)

// CPU sample generators: each creates a cuRAND CPU generator, seeds it with 42
// and fills `data` with the named distribution.

let genCpuUniform (data:float[]) =
    use rng = cuRAND.Generator.CreateCpu(cuRAND.RngType.PSEUDO_DEFAULT)
    rng.SetPseudoRandomGeneratorSeed(42UL)
    rng.GenerateUniform(data)

let genCpuNormal (data:float[]) =
    use rng = cuRAND.Generator.CreateCpu(cuRAND.RngType.PSEUDO_DEFAULT)
    rng.SetPseudoRandomGeneratorSeed(42UL)
    rng.GenerateNormal(data, 0.0, 1.0)

let genCpuLogNormal (data:float[]) =
    use rng = cuRAND.Generator.CreateCpu(cuRAND.RngType.PSEUDO_DEFAULT)
    rng.SetPseudoRandomGeneratorSeed(42UL)
    rng.GenerateLogNormal(data, 0.0, 1.0)

/// Builds a chart layout with the given title and axis titles, legend shown.
let layout title x y=
    Layout(title = title,
           xaxis=Xaxis(title = x, showgrid = false, zeroline = false),
           yaxis=Yaxis(title = y, showline = false),
           showlegend = true)

// Sample and plot sizes used by the four ECDF plots below.
let numSamples = 1024*1024
let numPlotPoints = 1000

// Normal distribution, CPU.
let x, y, _ = ecdfCpu numSamples numPlotPoints genCpuNormal
Scatter(name = "Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on CPU" "x" "probability")

// Normal distribution, GPU.
let x, y, _ = ecdfGpu numSamples numPlotPoints genGpuNormal
Scatter(name = "Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on GPU" "x" "probability")

// Log-normal distribution, CPU.
let x, y, _ = ecdfCpu numSamples numPlotPoints genCpuLogNormal
Scatter(name = "Log Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on CPU" "x" "probability")

// Log-normal distribution, GPU.
let x, y, _ = ecdfGpu numSamples numPlotPoints genGpuLogNormal
Scatter(name = "Log Normal", x = x, y = y, mode = "lines")
|> Chart.Plot
|> Chart.WithLayout (layout "Empirical Cumulative Distribution Function on GPU" "x" "probability")

// Single CPU vs GPU timing. Devices with at most 512 cores get the smaller
// problem size.
// NOTE(review): `megaBytes` scales the number of samples in units of
// 1024*1024; each sample is a 64-bit float, so the "MB" wording in the names
// and labels below counts samples, not bytes - confirm the intended unit.
let megaBytes = if Gpu.Default.Device.Cores <= 512 then 10 else 100
let numSamples = 1024*1024*megaBytes
let _, _, cpuTime = ecdfCpu numSamples 1000 genCpuNormal
let _, _, gpuTime = ecdfGpu numSamples 1000 genGpuNormal
let speedup = cpuTime.TotalMilliseconds / gpuTime.TotalMilliseconds

type data = { Device: string; Timing: string; Speedup: float }
let records =
    [|
        { Device = "CPU"; Timing = sprintf "%.2f ms" cpuTime.TotalMilliseconds; Speedup = 1.0}
        { Device = "GPU"; Timing = sprintf "%.2f ms" gpuTime.TotalMilliseconds; Speedup = speedup}
    |]
records |> Util.Table

// GPU-only timings over a range of problem sizes.
let megaBytes = if Gpu.Default.Device.Cores <= 512 then [1..10] else [100..100..900]

// List of (scale, elapsed milliseconds), one entry per problem size.
let gpuPerformance =
    seq {
        for scale in megaBytes do
            let numSamples = scale * 1024 * 1024
            let _, _, time = ecdfGpu numSamples 1000 genGpuNormal
            yield (scale, time.TotalMilliseconds)
    } |> Seq.toList

type data = { MegaBytes: int; Timing: string; MegaBytesPerSec: string }
gpuPerformance |> List.map (fun (s, t) ->
    {
        MegaBytes = s
        Timing = sprintf "%.2f ms" t
        // t is in milliseconds, hence the factor 1000 to get a per-second rate
        MegaBytesPerSec = sprintf "%.2f MB/sec" ((float s) / t * 1000.0)
    }) |> Util.Table

// Bar chart of the raw GPU timings.
let layout =
    Layout(title = "Timing in ms",
           xaxis=Xaxis(title = "Size in MB", showgrid = false, zeroline = false),
           yaxis=Yaxis(title = "Timing (ms)", showline = false))

Bar(x = (gpuPerformance |> List.map (fun (mb, t) -> sprintf "%d MB" mb)),
    y = (gpuPerformance |> List.map (fun (mb, t) -> t)))
|> Chart.Plot
|> Chart.WithLayout layout

// Bar chart of the GPU throughput derived from the same timings.
let layout =
    Layout(title = "Performance in MB per second",
           xaxis=Xaxis(title = "Size in MB", showgrid = false, zeroline = false),
           yaxis=Yaxis(title = "MB per second", showline = false))

Bar(x = (gpuPerformance |> List.map (fun (s, t) -> sprintf "%d MB" s)),
    y = (gpuPerformance |> List.map (fun (s, t) -> (float s) / t * 1000.0)))
|> Chart.Plot
|> Chart.WithLayout layout

// CPU vs GPU timings over the same range of problem sizes.
let megaBytes = if Gpu.Default.Device.Cores <= 512 then [1..10] else [100..100..900]

/// Runs the normal-distribution ECDF on CPU and GPU and returns
/// `(cpu milliseconds, gpu milliseconds, cpu / gpu speedup)`.
let performanceCpuGpu numSamples numPlotTimes =
    let _, _, cpuTime = ecdfCpu numSamples numPlotTimes genCpuNormal
    let _, _, gpuTime = ecdfGpu numSamples numPlotTimes genGpuNormal
    let cpuTime = cpuTime.TotalMilliseconds
    let gpuTime = gpuTime.TotalMilliseconds
    let speedup = cpuTime / gpuTime
    cpuTime, gpuTime, speedup

// List of (scale, cpu ms, gpu ms, speedup), one entry per problem size.
let gpuCpuPerformance =
    seq {
        for scale in megaBytes do
            let numSamples = scale * 1024 * 1024
            let cpuTime, gpuTime, speedup = performanceCpuGpu numSamples 1000
            yield (scale, cpuTime, gpuTime, speedup)
    } |> Seq.toList

type data = { MegaBytes: int; GPUTiming: string; CPUTiming: string; Speedup: string }
gpuCpuPerformance |> List.map (fun (s, ct, gt, speedup) ->
    {
        MegaBytes = s
        GPUTiming = sprintf "%.2f ms" gt
        CPUTiming = sprintf "%.2f ms" ct
        Speedup = sprintf "%.2f" speedup
    }) |> Util.Table

// Grouped bar chart comparing CPU and GPU throughput.
let layout =
    Layout(title = "CPU / GPU Performance in MB per second",
           xaxis=Xaxis(title = "Size in MB", showgrid = false, zeroline = false),
           yaxis=Yaxis(title = "MB per second", showline = false),
           showlegend = true)

seq {
    let x = gpuCpuPerformance |> List.map (fun (s, _, _, _) -> sprintf "%d MB" s)
    let yCpu = gpuCpuPerformance |> List.map (fun (s, t, _, _) -> (float s) / t * 1000.0)
    let yGpu = gpuCpuPerformance |> List.map (fun (s, _, t, _) -> (float s) / t * 1000.0)
    yield Bar(x = x, y = yCpu, name = "CPU")
    yield Bar(x = x, y = yGpu, name = "GPU")
}
|> Seq.toList
|> Chart.Plot
|> Chart.WithLayout layout