#load "Paket.fsx"
Paket.Version [ ("Alea", "3.0.3-beta2") ]
Paket.Package [ "NUnit" ]
#load "packages/Alea/Alea.fsx"
#r "packages/Alea/lib/net45/Alea.Parallel.dll"
#r "packages/NUnit/lib/net45/nunit.framework.dll"
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly
open System
open System.Threading.Tasks;
open Alea
open Alea.CSharp
open Alea.Parallel

/// The default GPU, shared by every session created below.
let gpu = Gpu.Default

/// One-line summary of the device: core count, total memory in GB and process bitness.
let s =
    sprintf "GPU is %A, Number of Cores %A, GPU DRAM is %.3f GB, Process is %d bit"
        gpu
        gpu.Device.Cores
        ((float) gpu.Device.TotalMemory / 1024. / 1024. / 1024.)
        (IntPtr.Size * 8)

{ Html = s }

/// Monte Carlo estimate of pi in two passes per batch: first every random point is
/// scored (4.0 when x*x + y*y < 1.0, otherwise 0.0) into `values`, then the scores
/// are summed and divided by `batchSize` to give that batch's estimate.
///
/// Returns a triple: the average of the per-batch estimates, the elapsed time in
/// milliseconds, and the number of distinct per-batch estimates.
///
/// NOTE(review): the element types of the `Allocate` calls are not spelled out here;
/// `point.x` / `point.y` imply a two-component point type - confirm.
let estimatePiTwoStage verbose rng seed batchSize batchs =
    let t = System.Diagnostics.Stopwatch.StartNew()
    use session = new Session(gpu)
    session.UsingPseudoRandom(rngType=rng, seed=seed)
    let points = session.Allocate batchSize
    let values = session.Allocate batchSize
    let pis = session.Allocate batchs
    for iBatch = 1 to batchs do
        session.RandomUniform(points)
        // Pass 1: score each point.
        session.For(0, batchSize,
                    (fun i ->
                        let point = points.[i]
                        let d = point.x * point.x + point.y * point.y
                        values.[i] <- if d < 1.0 then 4.0 else 0.0))
        // Pass 2: sum the scores; the result callback stores the batch mean.
        session.Aggregate(batchSize,
                          (fun i -> values.[i]),
                          (fun value -> pis.[iBatch - 1] <- value / (float batchSize)),
                          (fun a b -> a + b))
    let pis = Gpu.CopyToHost pis
    let pi = pis |> Array.average
    t.Stop()
    // NOTE(review): confirm that the summary line is meant to be printed only when
    // `verbose` is set.
    if verbose then
        printfn "%A" pis
        printfn "PI = %f, %A" pi t.Elapsed
    pi, t.Elapsed.TotalMilliseconds, ((pis |> Array.distinct).Length)

/// Same estimate as `estimatePiTwoStage`, but the per-point score is computed inside
/// the aggregation's value function, so no intermediate `values` buffer is allocated.
///
/// Returns a triple: the average of the per-batch estimates, the elapsed time in
/// milliseconds, and the number of distinct per-batch estimates.
let estimatePiCompact verbose rng seed batchSize batchs =
    let t = System.Diagnostics.Stopwatch.StartNew()
    use session = new Session(gpu)
    session.UsingPseudoRandom(rngType=rng, seed=seed)
    let points = session.Allocate batchSize
    let pis = session.Allocate batchs
    for iBatch = 1 to batchs do
        session.RandomUniform(points)
        session.Aggregate(batchSize,
                          (fun i ->
                              let point = points.[i]
                              let d = point.x * point.x + point.y * point.y
                              if d < 1.0 then 4.0 else 0.0),
                          (fun value -> pis.[iBatch - 1] <- value / (float batchSize)),
                          (fun a b -> a + b))
    let pis = Gpu.CopyToHost pis
    let pi = pis |> Array.average
    t.Stop()
    // NOTE(review): confirm that the summary line is meant to be printed only when
    // `verbose` is set.
    if verbose then
        printfn "%A" pis
        printfn "PI = %f, %A" pi t.Elapsed
    pi, t.Elapsed.TotalMilliseconds, ((pis |> Array.distinct).Length)

/// CPU reference implementation: random numbers come from a cuRAND CPU generator and
/// the points are scored with `Parallel.For`.
///
/// `points` holds 2 * batchSize numbers per batch; the x-coordinate of sample `i` is
/// read from index `i` and its y-coordinate from index `batchSize + i`.
///
/// Returns a triple: the average of the per-batch estimates, the elapsed time in
/// milliseconds, and the number of distinct per-batch estimates.
///
/// NOTE(review): `seed` is accepted but never applied to the generator in this
/// function - confirm whether the CPU generator should be seeded.
let estimatePiCpu verbose rngType seed batchSize batchs =
    let t = System.Diagnostics.Stopwatch.StartNew()
    let points = Array.zeroCreate (2*batchSize)
    let values = Array.zeroCreate batchSize
    let pis = Array.zeroCreate batchs
    use rng = cuRAND.Generator.CreateCpu rngType
    for iBatch = 1 to batchs do
        // skip ahead to the right offset to generate non-overlapping blocks of random numbers
        let offset = 2UL * (uint64 batchSize) * (uint64 iBatch)
        rng.SetGeneratorOffset offset
        rng.GenerateUniform(points)
        Parallel.For(0, batchSize,
                     (fun i ->
                         let pointx = points.[i]
                         // Index by `i`: a constant index here would give every
                         // sample of the batch the same y-coordinate.
                         let pointy = points.[batchSize + i]
                         let d = pointx * pointx + pointy * pointy
                         values.[i] <- if d < 1.0 then 4.0 else 0.0)) |> ignore
        pis.[iBatch - 1] <- values |> Array.average
    let pi = pis |> Array.average
    t.Stop()
    // NOTE(review): confirm that the summary line is meant to be printed only when
    // `verbose` is set.
    if verbose then
        printfn "%A" pis
        printfn "PI = %f, %A" pi t.Elapsed
    pi, t.Elapsed.TotalMilliseconds, ((pis |> Array.distinct).Length)

/// Runs the implementation selected by `impl` ("gpu compact", "gpu two stage" or
/// "cpu") with a fixed seed of 1UL and verbose output disabled.
///
/// Returns (pi estimate, elapsed milliseconds, distinct batch count, rng, impl).
/// Fails with "unknown implementation ..." for any other `impl`; the "cpu" path
/// fails with "TODO" for generator types other than the three mapped below.
let estimatePi batches batchSize impl rng =
    // Maps the session-level generator type to the cuRAND one; evaluated lazily so
    // that the "TODO" failure can only be raised on the "cpu" path.
    let cuRANDType () =
        match rng with
        | PseudoRandomType.XORWOW -> cuRAND.RngType.PSEUDO_XORWOW
        | PseudoRandomType.MRG32K3A -> cuRAND.RngType.PSEUDO_MRG32K3A
        | PseudoRandomType.PHILOX4_32_10 -> cuRAND.RngType.PSEUDO_PHILOX4_32_10
        | _ -> failwith "TODO"
    let seed = 1UL
    let pi, t, l =
        match impl with
        | "gpu compact" -> estimatePiCompact false rng seed batchSize batches
        | "gpu two stage" -> estimatePiTwoStage false rng seed batchSize batches
        | "cpu" -> estimatePiCpu false (cuRANDType()) seed batchSize batches
        | _ -> failwithf "unknown implementation %s" impl
    pi, t, l, rng, impl

/// Workload size: devices with at most 512 cores get a ten times smaller batch.
let batches, batchSize =
    if Gpu.Default.Device.Cores <= 512 then 50, 1000000
    else 50, 10000000

// JIT to get more accurate timings
estimatePi batches batchSize "gpu compact" PseudoRandomType.XORWOW
estimatePi batches batchSize "gpu two stage" PseudoRandomType.XORWOW

/// Every implementation measured against every generator type.
let results =
    [
        estimatePi batches batchSize "gpu compact" PseudoRandomType.XORWOW
        estimatePi batches batchSize "gpu two stage" PseudoRandomType.XORWOW
        estimatePi batches batchSize "cpu" PseudoRandomType.XORWOW
        estimatePi batches batchSize "gpu compact" PseudoRandomType.MRG32K3A
        estimatePi batches batchSize "gpu two stage" PseudoRandomType.MRG32K3A
        estimatePi batches batchSize "cpu" PseudoRandomType.MRG32K3A
        estimatePi batches batchSize "gpu compact" PseudoRandomType.PHILOX4_32_10
        estimatePi batches batchSize "gpu two stage" PseudoRandomType.PHILOX4_32_10
        estimatePi batches batchSize "cpu" PseudoRandomType.PHILOX4_32_10
    ]

/// CPU timing per generator type; the baseline for the speedup figures.
let cpuTimes =
    results
    |> List.filter (fun (pi, t, l, rng, impl) -> impl = "cpu")
    |> List.map (fun (pi, t, l, rng, impl) -> rng, t)
    |> Map.ofList

/// One row of the results table.
type data = { RngType: string; Implementation: string; Value: float; Timing: string; Speedup: string }

results
|> List.map (fun (pi, t, l, rng, impl) ->
    { RngType = sprintf "%A" rng
      Implementation = impl
      Value = pi
      // `t` is in milliseconds; shown in seconds.
      Timing = sprintf "%.2f s" (t / 1000.0)
      // Speedup is the CPU time for the same generator divided by this run's time.
      Speedup = sprintf "%.1f" ((cpuTimes |> Map.find rng) / t) })
|> Util.Table

let layout =
    Layout(title = "CPU / GPU Performance",
           xaxis=Xaxis(title = "Random number generators", showgrid = false, zeroline = false),
           yaxis=Yaxis(title = "Speedup", showline = false),
           showlegend = true)

/// Results grouped by implementation name: one bar series per implementation.
let grouped = results |> List.groupBy (fun (pi, t, l, rng, impl) -> impl)

let bars =
    grouped
    |> List.map(fun (impl, s) ->
        let x = s |> List.map (fun (pi, t, l, rng, impl) -> sprintf "%A" rng)
        let y = s |> List.map (fun (pi, t, l, rng, impl) -> (cpuTimes |> Map.find rng) / t)
        Bar(x = x, y = y, name = impl))

bars |> Seq.toList |> Chart.Plot |> Chart.WithLayout layout