# Load the bio / comp author tables, attach gender probabilities from two
# name-to-gender services (genderize.io and Gender-API), and plot the mean
# probability-female per dataset and per calling method.
#
# Written against the DataArrays-era DataFrames API (`isna`, `dropna`, `pool!`, `by`,
# `df[:col]`). One statement per line: inline comments must not share a line with
# following code, or they silently comment it out.

include("../src/dataimport.jl")
using DataFrames

bio = importauthors("../data/pubdata/bio.csv", "bio")
comp = importauthors("../data/pubdata/comp.csv", "comp")
# Preview (at most) the first 5 rows; clamped so a short table does not throw.
comp[1:min(5, size(comp, 1)), :]

alldata = vcat(bio, comp)
# Rebind the per-dataset tables now that they are merged
# (presumably so the originals can be garbage-collected).
bio = 0
comp = 0

# getgenderprob returns two columns per service; they are stored as <service>P and
# <service>Count. NOTE(review): assumed to be P(female) and the number of samples
# behind the estimate -- confirm against src/dataimport.jl.
alldata[:izeP], alldata[:izeCount] = getgenderprob(
    alldata,
    "../data/genders/genderize_genders.json",
    :Author_First_Name)

alldata[:apiP], alldata[:apiCount] = getgenderprob(
    alldata,
    "../data/genders/genderAPI_genders.json",
    :Author_First_Name)

pool!(alldata)

# remove rows where there's no Journal
alldata = alldata[!isna(alldata[:Journal]), :]
# get all columns for rows where the Dataset column is "bio"
biodata = alldata[alldata[:Dataset] .== "bio", :]
compdata = alldata[alldata[:Dataset] .== "comp", :]
# get (at most) the first 5 rows, and all columns
biodata[1:min(5, size(biodata, 1)), :]

using StatPlots
gr()

# Mean probability-female per dataset, once per calling method (NA values dropped).
izemeans = by(alldata, [:Dataset], df -> DataFrame(MeanPF = mean(dropna(df[:izeP]))))
apimeans = by(alldata, [:Dataset], df -> DataFrame(MeanPF = mean(dropna(df[:apiP]))))

izemeans[:method] = "genderize"
apimeans[:method] = "genderAPI"

allmeans = vcat(izemeans, apimeans)

# One column per dataset level, one row per method (genderize first, then genderAPI,
# because that is the vcat order above).
meandatasets = levels(allmeans[:Dataset])
ys = hcat([allmeans[allmeans[:Dataset] .== x, :MeanPF] for x in meandatasets]...)

# Series labels are passed as a 1x2 row matrix: Plots spreads row matrices across
# series, whereas a column vector is handed whole to every series.
# NOTE(review): the y values are proportions in [0, 1], not percentages.
groupedbar(ys, bar_position=:dodge, ylims=(0,1),
    xticks=([1,2],["genderize", "genderAPI"]),
    lab=["Bio" "comp"],
    xlabel="Gender Calling Method", ylabel="Percent Female",
    title="Proportion of Female Authors")

# Mean probability-female per (author position, dataset); the anonymous function
# returns a scalar, so `by` names the result column :x1.
genderize_byposition = by(alldata, [:Position, :Dataset], df -> mean(dropna(df[:izeP])))
genderapi_byposition = by(alldata, [:Position, :Dataset], df -> mean(dropna(df[:apiP])))

# Genderize.io matrix for the by-position chart that follows: one column per dataset.
ys = hcat([genderize_byposition[genderize_byposition[:Dataset] .== x, :x1]
           for x in levels(genderize_byposition[:Dataset])]...)
# By-position bar charts for both services, then coverage statistics: how many distinct
# first names / how many authors each service returns a non-zero count for, with and
# without single-character first names (initials).

# --- By position, Genderize.io (uses the `ys` matrix computed just before this point) ---
ize_positions = levels(genderize_byposition[:Position])
ize_datasets = levels(genderize_byposition[:Dataset])
# Tick positions follow the number of position levels instead of a hard-coded 1:5, and
# series labels are reshaped to a 1xN row matrix so Plots assigns one label per series.
groupedbar(ys, bar_position=:dodge, ylims=(0,0.6),
    xticks=(1:length(ize_positions), ize_positions),
    lab=reshape(ize_datasets, 1, length(ize_datasets)),
    xlabel="Author Position", ylabel="Percent Female",
    title="By Position, Genderize.io")

# --- By position, GenderAPI ---
api_positions = levels(genderapi_byposition[:Position])
api_datasets = levels(genderapi_byposition[:Dataset])
ys = hcat([genderapi_byposition[genderapi_byposition[:Dataset] .== x, :x1]
           for x in api_datasets]...)
groupedbar(ys, bar_position=:dodge, ylims=(0,0.6),
    xticks=(1:length(api_positions), api_positions),
    lab=reshape(api_datasets, 1, length(api_datasets)),
    xlabel="Author Position", ylabel="Percent Female",
    title="By Position, GenderAPI")

# One row per distinct (first name, izeP, apiP) combination in the bio data, with the
# mean counts and the number of authors carrying that name.
# NOTE(review): this global shadows the `names` function for the rest of the script;
# it is kept because later code refers to it by this name.
names = by(biodata, [:Author_First_Name, :izeP, :apiP],
    df -> DataFrame(
        izeCount = mean(df[:izeCount]),
        apiCount = mean(df[:apiCount]),
        Frequency = length(df[:izeCount])
        )
    )
# Previews are clamped to the rows actually present so short tables do not throw.
names[1:min(5, size(names, 1)), :]

# Names that are a single character long, i.e. initials.
initials = names[map(length, names[:Author_First_Name]) .== 1, :]
initials[1:min(26, size(initials, 1)), :]

# 1. how many of our names can the service guess?
names_total = length(names[:Author_First_Name])
println("Gender-API: $(length(names[names[:apiCount] .!= 0, :Author_First_Name]) / names_total)")
println("Genderize.io: $(length(names[names[:izeCount] .!= 0, :Author_First_Name]) / names_total)")

# 2. what proportion of authors can the service guess (this is a different question)
bio_total = length(biodata[:Author_First_Name])
println("Gender-API: $(length(biodata[biodata[:apiCount] .!= 0, :Author_First_Name]) / bio_total)")
println("Genderize.io: $(length(biodata[biodata[:izeCount] .!= 0, :Author_First_Name]) / bio_total)")

# Row masks reused below: first name longer than one character (not an initial).
bio_notinitial = map(length, biodata[:Author_First_Name]) .!= 1
comp_notinitial = map(length, compdata[:Author_First_Name]) .!= 1

# Same ratios counting only non-initial names in the numerator; the denominator is
# still every bio author.
println("Gender-API: $(length(biodata[(biodata[:apiCount] .!= 0) & bio_notinitial, :Author_First_Name]) / bio_total)")
println("Genderize.io: $(length(biodata[(biodata[:izeCount] .!= 0) & bio_notinitial, :Author_First_Name]) / bio_total)")

# For each dataset / position: mean izeP over all authors, then excluding initials.
bio_first = biodata[:Position] .== "first"
bio_last = biodata[:Position] .== "last"
comp_first = compdata[:Position] .== "first"
comp_last = compdata[:Position] .== "last"

# Bio data, first author
println(mean(dropna(biodata[bio_first, :izeP])))
println(mean(dropna(biodata[bio_first & bio_notinitial, :izeP])))

# Comp data, first author
println(mean(dropna(compdata[comp_first, :izeP])))
println(mean(dropna(compdata[comp_first & comp_notinitial, :izeP])))

# Bio data, last author
println(mean(dropna(biodata[bio_last, :izeP])))
println(mean(dropna(biodata[bio_last & bio_notinitial, :izeP])))

# Comp data, last author
println(mean(dropna(compdata[comp_last, :izeP])))
println(mean(dropna(compdata[comp_last & comp_notinitial, :izeP])))

# 3. for names that can be guessed, how certain can we be that the gender assignment is correct?
# Mean service count over bio authors whose name the service returned a non-zero count
# for, then a log-log scatter of name frequency against each service's count.
# NOTE(review): the *Count columns are presumably the number of samples behind each
# gender estimate -- confirm against getgenderprob.
println("Gender-API: $(mean(biodata[biodata[:apiCount] .!= 0, :apiCount]))")
println("Genderize.io: $(mean(biodata[biodata[:izeCount] .!= 0, :izeCount]))")

# Excluding initials
println("Genderize no initials: $(mean(biodata[(biodata[:izeCount] .!= 0) & (map(length, biodata[:Author_First_Name]) .!= 1), :izeCount]))")

# Keep only names with a positive count from both services, so both series can be
# drawn on a log10 axis.
n = names[(names[:apiCount] .> 0) & (names[:izeCount] .> 0), :]
# Preview at most 5 rows; clamped so a short table does not throw.
n[1:min(5, size(n, 1)), :]

scatter(n, :Frequency, :izeCount, lab="genderize.io", α=0.5,
    yaxis=("Count", :log10), xaxis=("Name Frequency", :log10))
scatter!(n, :Frequency, :apiCount, lab="genderAPI", α=0.5)