using StatPlots

# Supplies the `importauthors()` and `getgenderprob()` functions.
include("../src/dataimport.jl")

# Load the two publication datasets and stack them into a single table.
bio = importauthors("../data/pubdata/bio.csv", "bio")
comp = importauthors("../data/pubdata/comp.csv", "comp")
alldata = vcat(bio, comp)

# Rebind the per-dataset tables so the originals can be reclaimed (frees memory).
bio = 0
comp = 0

# Attach the two columns returned by `getgenderprob`, looked up by first name.
alldata[:Pfemale], alldata[:Count] = getgenderprob(
    alldata,
    "../data/genders/genderAPI_genders.json",
    :Author_First_Name)
pool!(alldata)

# Discard rows that have no Journal.
alldata = alldata[!isna(alldata[:Journal]), :]
alldata[1:5, 1:7]

# Mean of the non-NA `Pfemale` values for each dataset.
means = by(alldata, [:Dataset]) do df
    DataFrame(MeanPF = mean(dropna(df[:Pfemale])))
end

bar(means, :MeanPF,
    xaxis=("Dataset", ([1,2], means[:Dataset])),
    yaxis=("Percent Female", (0, 0.6), 0:0.1:0.6),
    legend=false,
    grid=false,
    title="Proportion of Female Authors")

# Rank of each author-position label; lower rank sorts first.
order = Dict(zip(["first", "second", "other", "penultimate", "last"], 1:5))

# `lt` comparator for `sort!`: true when `pos1` ranks strictly before `pos2` in `order`.
authororder(pos1, pos2) = order[pos1] < order[pos2]

println(authororder("first", "second"))
println(authororder("second", "first"))

sort!(alldata, cols=:Position, lt=authororder)
alldata[1:5, 1:7]

# Mean of the non-NA `Pfemale` values per (position, dataset); the value column is `:x1`.
byposition = by(alldata, [:Position, :Dataset]) do df
    mean(dropna(df[:Pfemale]))
end
sort!(byposition, cols=:Position, lt=authororder)

# One column of means per dataset level, rows in `authororder` order.
ys = hcat(map(d -> byposition[byposition[:Dataset] .== d, :x1],
              levels(byposition[:Dataset]))...)
# Grouped bars: one group per author position, one bar per dataset.
# The rows of `ys` come from a table sorted with `authororder`, so the tick labels are
# written out in that same order. `levels(alldata[:Position])` is not tied to that sort
# and can attach the wrong label to a bar.
groupedbar(ys, bar_position=:dodge,
    xaxis=("Author Position", (1:5, ["first", "second", "other", "penultimate", "last"])),
    yaxis=("Percent Female", (0, 0.6), 0:0.1:0.6),
    legend=false,
    grid=false,
    title="Proportion of Female Authors")

"""
    bootstrap(a, n)

Bootstrap the mean of `a` using `n` resamples.

Each resample draws `length(a)` elements from `a` with `sample!` (NA entries included),
drops the NAs and takes the mean. Resamples whose mean is `NaN` (for example, every
drawn element was NA) are skipped, so they cannot poison the summary statistics.

Returns `(m, [lower, upper])`: the mean of the usable resample means together with
their 2.5% and 97.5% quantiles. When no resample yields a usable mean the result is
`(NaN, [NaN, NaN])`.
"""
function bootstrap{T<:Number}(a::DataArray{T}, n::Int)
    # `sample!` overwrites the whole buffer, so one allocation serves every iteration.
    resample = similar(a, length(a))
    means = Float64[]
    sizehint!(means, n)
    for _ in 1:n
        m = mean(dropna(sample!(a, resample)))
        # `dropna` on a plain Vector{Float64} removes nothing, so filter NaN here.
        isnan(m) || push!(means, m)
    end
    isempty(means) && return (NaN, [NaN, NaN])
    return (mean(means), [quantile(means, .025), quantile(means, .975)])
end

# Quick check of `bootstrap` on a small array.
t = @data([1,2,3,4,5])
(m, ci) = bootstrap(t, 1000)

# Bootstrapped mean and interval of `Pfemale` for every (dataset, position) pair.
by(alldata, [:Dataset, :Position]) do df
    (m, ci) = bootstrap(df[:Pfemale], 1000)
    return DataFrame(Mean=m, Lower=ci[1], Upper=ci[2])
end

# Compare the sets of article IDs found in the two datasets.
bioids = Set(levels(alldata[alldata[:Dataset] .== "bio", :ID]))
compids = Set(levels(alldata[alldata[:Dataset] .== "comp", :ID]))

println("There are $(length(bioids)) articles in the \"bio\" dataset")
println("There are $(length(compids)) articles in the \"comp\" dataset")

dif = length(setdiff(compids, bioids))
println("There are $dif articles in the \"comp\" dataset that aren't in the \"bio\" dataset")

# Rows of the "bio" dataset published in either of the two PLoS journals of interest.
plosfocus = alldata[(alldata[:Dataset] .== "bio")&
                    ((alldata[:Journal] .== String("PLoS Biol."))|
                     (alldata[:Journal] .== String("PLoS Comput. Biol."))), :]

df = by(plosfocus, [:Journal, :Position]) do df
    (m, ci) = bootstrap(df[:Pfemale], 1000)
    return DataFrame(Mean=m, Lower=ci[1], Upper=ci[2])
end

sort!(df, cols=:Position, lt=authororder)
show(df)

# One column of bootstrapped means per journal, rows in `authororder` order.
ys = hcat([df[df[:Journal] .== x, :Mean] for x in ["PLoS Biol.", "PLoS Comput. Biol."]]...)
# Grouped bars for the two PLoS journals, one group per author position.
groupedbar(ys, bar_position=:dodge, lab=["PLoS Bio", "PLoS Comp Bio"],
    xaxis=("Author Position", (1:5, ["first", "second", "other", "penultimate", "last"])),
    yaxis=("Percent Female", (0, 0.6), 0:0.1:0.6),
    grid=false,
    title="Proportion of Female Authors in PLoS Journals")

# Restrict the "bio" dataset to journals that contribute more than 1000 rows.
bio = alldata[alldata[:Dataset] .== "bio", :]
c = countmap(bio[:Journal])
journals = bio[map(x -> c[x] > 1000, bio[:Journal]), :]

"""
    checktitle(t)

Return `true` when the journal title `t` contains any of the keywords "comput",
"omic", "informatic" or "system".

Matching ignores case: journal abbreviations capitalise their words (for example
"PLoS Comput. Biol."), which a case-sensitive search for "comput" would miss.
"""
function checktitle(t::AbstractString)
    lowered = lowercase(t)
    return any(f -> contains(lowered, f), ["comput", "omic", "informatic", "system"])
end

# Mean of the non-NA `Pfemale` values per journal.
data = by(journals, :Journal) do df
    m = mean(dropna(df[:Pfemale]))
    return DataFrame(Mean=m)
end

# Grey bars for titles matched by `checktitle`, black for the rest.
data[:Color] = [checktitle(x) ? :grey : :black for x in data[:Journal]]
data = data[!isna(data[:Mean]), :]
sort!(data, cols=:Mean)

bar(data[:Mean], color=data[:Color], grid=false, legend=false, bar_edges=false)

# Same pipeline for the arXiv datasets, matching on the full author name column.
arxivcs = importauthors("../data/pubdata/arxivcs.csv", "arxivcs")
arxivbio = importauthors("../data/pubdata/arxivbio.csv", "arxivbio")
arxiv = vcat(arxivbio, arxivcs)

# Rebind so the per-dataset tables can be reclaimed.
arxivcs = 0
arxivbio = 0

pool!(arxiv)
arxiv = arxiv[!isna(arxiv[:Author_Name]), :]
arxiv[:Pfemale], arxiv[:Count] = getgenderprob(arxiv, "../data/genders/genderAPI_genders.json", :Author_Name)

# NOTE(review): `bystats` and `my_colors` are not defined in the visible part of this
# file, and the call below uses Gadfly-style arguments (`Scale`, `Guide`, `Geom`,
# `Theme`, `mm`) although only StatPlots is loaded here. Confirm where these come from.
arxivbyposition = bystats(arxiv, [:Dataset, :Position])
plot(arxivbyposition, x=:Position, y=:Mean, color=:Dataset,
    Scale.color_discrete_manual(my_colors...),
    Guide.title("Female Authors in arXiv"),
    Geom.bar(position = :dodge),
    Scale.x_discrete(levels=["first", "second", "other", "penultimate", "last"]),
    Theme(bar_spacing=2mm))