In this tutorial, we show how LG in conjunction with other utility packages can be used for extracting the most recent directed acyclic graph (DAG) of the Julia package system. This information can be used for interactive data visualization with D3 like in the following links:
All the packages used in this notebook can be installed with:
# Install every package this notebook depends on (no-op when already installed).
notebookdeps = ["HTTP", "JSON", "GitHub", "LightGraphs", "ProgressMeter"]
foreach(Pkg.add, notebookdeps)
INFO: Package HTTP is already installed INFO: Package JSON is already installed INFO: Package GitHub is already installed INFO: Package LightGraphs is already installed INFO: Package ProgressMeter is already installed
In order to be able to query information from GitHub without being misinterpreted as a malicious robot, you need to create a personal token in your GitHub settings. Since this token is private, we ask you to save it as an environment variable in your operating system (e.g. set GITHUB_AUTH
in your .bashrc
file). This variable will be read in Julia and used for authentication as follows:
using HTTP
using JSON
using GitHub
using LightGraphs
using ProgressMeter
# Authenticate with GitHub to increase query limits.
# The token must be stored in the GITHUB_AUTH environment variable (see above);
# fail early with a clear message instead of an opaque KeyError.
haskey(ENV, "GITHUB_AUTH") ||
    error("GITHUB_AUTH environment variable not set; create a personal access token in your GitHub settings and export it")
mytoken = ENV["GITHUB_AUTH"]
myauth = GitHub.authenticate(mytoken)
WARNING: deprecated syntax "abstract GitHubType" at /home/juliohm/.julia/v0.6/GitHub/src/utils/GitHubType.jl:20. Use "abstract type GitHubType end" instead. WARNING: deprecated syntax "typealias GitHubString Compat.UTF8String" at /home/juliohm/.julia/v0.6/GitHub/src/utils/GitHubType.jl:22. Use "const GitHubString = Compat.UTF8String" instead. WARNING: deprecated syntax "abstract Authorization" at /home/juliohm/.julia/v0.6/GitHub/src/utils/auth.jl:6. Use "abstract type Authorization end" instead.
GitHub.OAuth2(8cda0d**********************************)
After successful authentication, we are now ready to start coding. First, we extract the names of all registered packages in METADATA and assign to each of them a unique integer id:
# List all registered packages in METADATA: keep only package directories,
# excluding the VCS and test folders.
metadata = Pkg.dir("METADATA")
pkgs = filter(p -> isdir(joinpath(metadata, p)) && p ∉ [".git", ".test"], readdir(metadata))
# Map each package name to a unique integer id (used as the graph vertex id).
pkgdict = Dict{String,Int}(pkg => i for (i, pkg) in enumerate(pkgs))
pkgdict
Dict{String,Int64} with 1500 entries: "Levenshtein" => 724 "ReadStat" => 1141 "Discretizers" => 326 "SchumakerSpline" => 1209 "FredData" => 455 "GaussQuadrature" => 475 "RecurrenceAnalysis" => 1147 "MKLSparse" => 843 "AnsiColor" => 20 "ProximalOperators" => 1075 "Luxor" => 776 "RobustLeastSquares" => 1186 "Temporal" => 1353 "Robotlib" => 1184 "PiecewiseLinearOpt" => 1026 "JLDArchives" => 665 "MatrixDepot" => 803 "CodeTools" => 168 "NumericSuffixes" => 935 "COBRA" => 162 "Crypto" => 234 "Mongo" => 857 "ROOT" => 1194 "MNIST" => 849 "RandomMatrices" => 1123 ⋮ => ⋮
Using the ids, we can easily build the DAG of packages with LG:
# Build the package dependency DAG: an edge pkg → c means c depends on pkg.
DAG = DiGraph(length(pkgs))
@showprogress 1 "Building graph..." for pkg in pkgs
    for child in Pkg.dependents(pkg)
        add_edge!(DAG, pkgdict[pkg], pkgdict[child])
    end
end
Building graph...100% Time: 0:04:02
We are interested in finding all the descendents of a package. In other words, we are interested in finding all packages that are influenced by a given package. In this context, we further want to save the level of dependency (or geodesic distance) from descendents to the package being queried. This is a straightforward operation in LG:
# For every package, collect all packages reachable from it in the DAG together
# with their geodesic distance ("level" of dependency) from the queried package.
descendents = []
for pkg in pkgs
    dists = gdistances(DAG, pkgdict[pkg])
    reachable = find(dists .> 0)
    desc = [Dict("id" => pkgs[v], "level" => dists[v]) for v in reachable]
    push!(descendents, desc)
end
For each package, we also want to save information about who has contributed to the project. This task is easy to implement with the awesome GitHub.jl API. However, some of the packages registered in METADATA are hosted on different websites such as gitlab, for which an API is missing. We simply skip them and ask authors to migrate their code to GitHub if possible:
# For each package, collect GitHub contributor logins and contribution counts.
# Packages hosted outside github.com get an empty list; packages whose URL does
# not match the expected pattern are skipped instead of crashing the loop.
# Both arrays are pushed exactly once per package so they stay aligned with pkgs.
pkgcontributors = []
hostnames = []
@showprogress 1 "Finding contributors..." for pkg in pkgs
    url = Pkg.Read.url(pkg)
    m = match(r".*://([a-z.]*)/(.*)\.git.*", url)
    if m === nothing
        # URL doesn't follow the host/repo.git pattern — keep arrays aligned.
        push!(pkgcontributors, [])
        push!(hostnames, "unknown")
        continue
    end
    hostname = m[1]; reponame = m[2]
    if hostname == "github.com"
        users, _ = contributors(reponame, auth=myauth)
        usersdata = map(u -> (u["contributor"].login, u["contributions"]), users)
        pkgcontrib = [Dict("id"=>u, "contributions"=>c) for (u,c) in usersdata]
        push!(pkgcontributors, pkgcontrib)
    else
        # Non-GitHub host (e.g. gitlab): no API available, record empty list.
        push!(pkgcontributors, [])
    end
    push!(hostnames, hostname)
end
Finding contributors...100% Time: 0:12:27
We also extract the Julia version required in the last tag of a package. Both the lower and upper bounds are saved as well as a "cleaned" major.minor
string for the lower bound, which is useful for data visualization:
# Extract the Julia version requirement from each package's most recent tag.
# The entry is pushed exactly once per package so juliaversion stays aligned
# with pkgs ("NA" = no julia requirement, "BOGUS" = no versions directory).
juliaversion = []
for pkg in pkgs
    versiondir = joinpath(Pkg.dir("METADATA"), pkg, "versions")
    if isdir(versiondir)
        # readdir sorts lexicographically, which misorders versions
        # (e.g. "0.10.0" < "0.2.0"); compare them as VersionNumbers instead.
        versions = readdir(versiondir)
        latestversion = versions[indmax(map(VersionNumber, versions))]
        reqfile = joinpath(versiondir, latestversion, "requires")
        if isfile(reqfile)
            reqs = Pkg.Reqs.parse(reqfile)
        else
            reqs = Dict()
        end
        if "julia" ∈ keys(reqs)
            vinterval = reqs["julia"].intervals[1]
            vmin = vinterval.lower
            # "cleaned" major.minor string of the lower bound, for visualization
            majorminor = "v$(vmin.major).$(vmin.minor)"
            push!(juliaversion, Dict("min"=>string(vinterval.lower),
                                     "max"=>string(vinterval.upper),
                                     "majorminor"=>majorminor))
        else
            push!(juliaversion, Dict("min"=>"NA", "max"=>"NA", "majorminor"=>"NA"))
        end
    else
        push!(juliaversion, Dict("min"=>"BOGUS", "max"=>"BOGUS", "majorminor"=>"BOGUS"))
    end
end
Finally, we save the data in a JSON file:
# Assemble the node and link lists expected by D3 and serialize them as JSON.
nodes = [Dict("id"           => pkgs[v],
              "indegree"     => indegree(DAG, v),
              "outdegree"    => outdegree(DAG, v),
              "juliaversion" => juliaversion[v],
              "descendents"  => descendents[v],
              "contributors" => pkgcontributors[v]) for v in vertices(DAG)]
links = [Dict("source" => pkgs[src(e)], "target" => pkgs[dst(e)]) for e in edges(DAG)]
data = Dict("nodes"=>nodes, "links"=>links)

# Dump to disk with 2-space indentation.
open("DAG-Julia-Pkgs.json", "w") do f
    JSON.print(f, data, 2)
end
Having extracted and saved the DAG of Julia packages, we take this opportunity to find out the Julians responsible for this amazing package system.
We use LG again to build this social network:
# Collect the unique, sorted set of GitHub usernames over all contributor lists.
julians = []
for pkgcontrib in pkgcontributors
    for contrib in pkgcontrib
        push!(julians, contrib["id"].value)
    end
end
julians = sort(unique(julians))

# Map each username to a unique integer id (the social-network vertex id).
juliandict = Dict{String,Int}()
for (i, julian) in enumerate(julians)
    juliandict[julian] = i
end
juliandict
Dict{String,Int64} with 1558 entries: "credentiality" => 496 "ZacCranko" => 238 "Snnappie" => 206 "benhamner" => 387 "lynyus" => 991 "iraikov" => 781 "nstiurca" => 1143 "pearlzli" => 1180 "GunnarFarneback" => 90 "njwilson23" => 1135 "gustafsson" => 719 "cgoldammer" => 449 "garrison" => 680 "lobingera" => 977 "randyzwitch" => 1232 "JonathanAnderson" => 123 "madanim" => 999 "Armavica" => 20 "Matt5sean3" => 148 "slangangular" => 1356 "raphapr" => 1236 "kuldeepdhaka" => 946 "jdrugo" => 818 "J-Revell" => 103 "fserra" => 668 ⋮ => ⋮
# Build an undirected social network: two Julians are connected whenever they
# contributed to at least one common package. Along the way, accumulate each
# Julian's total number of contributions across all packages.
socialnet = Graph(length(julians))
contribdict = Dict{String,Int}()
for pkgcontrib in pkgcontributors
    ids = [c["id"].value for c in pkgcontrib]
    contribs = [c["contributions"] for c in pkgcontrib]
    for (id, n) in zip(ids, contribs)
        contribdict[id] = get(contribdict, id, 0) + n
    end
    # connect every unordered pair of co-contributors
    for i in eachindex(ids), j in 1:i-1
        add_edge!(socialnet, juliandict[ids[i]], juliandict[ids[j]])
    end
end
njulians = nv(socialnet)
nconnections = ne(socialnet)
info("$njulians Julians and $nconnections connections")
INFO: 1558 Julians and 43978 connections
For each node of the social network, we use GitHub API to retrieve user information:
# Retrieve public profile information for every Julian via the GitHub REST API.
# NOTE(review): passing the token as an access_token query parameter is
# deprecated by GitHub and can leak into logs — consider sending it in the
# Authorization header instead.
juliansinfo = []
@showprogress 1 "Retrieving Julian info..." for julian in julians
    resp = HTTP.get("https://api.github.com/users/$julian?access_token=$mytoken")
    # parse the JSON response body directly (the identity() wrapper was a no-op)
    push!(juliansinfo, JSON.Parser.parse(String(resp.body)))
end
Retrieving Julian info...100% Time: 0:04:44
If the user has typed an address on their profile, we find an approximate latitude/longitude with the Google Maps geocoding API:
# Geocode each Julian's free-form profile address with the Google Maps API.
# Each of the four parallel arrays is pushed exactly once per Julian so they
# stay aligned with juliansinfo. (The original pushed `countries` inside the
# address_components loop, so a result with zero "country" components skipped
# the push — desynchronizing the arrays — and multiple components duplicated it.)
locnames = []
latitudes = []
longitudes = []
countries = []
@showprogress 1 "Geocoding Julian address..." for julian in juliansinfo
    # defaults when no address is given or geocoding returns no result
    locname = nothing
    latitude = nothing
    longitude = nothing
    country = nothing
    address = julian["location"]
    if address ≠ nothing
        # strip dashes and URL-encode spaces before building the query
        address = replace(address, "–", "")
        address = replace(address, " ", "+")
        resp = HTTP.get("http://maps.google.com/maps/api/geocode/json?address=$address")
        results = JSON.Parser.parse(String(resp.body))["results"]
        if length(results) > 0
            geoinfo = results[1]
            locname = geoinfo["formatted_address"]
            loccoords = geoinfo["geometry"]["location"]
            latitude = loccoords["lat"]
            longitude = loccoords["lng"]
            # take the first "country" component, if any
            for comp in geoinfo["address_components"]
                if "country" ∈ comp["types"]
                    country = comp["long_name"]
                    break
                end
            end
        end
    end
    push!(locnames, locname)
    push!(latitudes, latitude)
    push!(longitudes, longitude)
    push!(countries, country)
end
Geocoding Julian address...100% Time: 0:04:37
Finally, we use JSON again to save the data:
# Assemble user nodes and co-contribution links for the social network,
# then serialize them as JSON. The per-user arrays (locnames, latitudes,
# longitudes, countries) are indexed in lockstep with juliansinfo.
usernodes = [Dict("id"            => u["login"],
                  "name"          => u["name"],
                  "avatar_url"    => u["avatar_url"],
                  "contributions" => contribdict[u["login"]],
                  "location"      => locnames[i],
                  "latitude"      => latitudes[i],
                  "longitude"     => longitudes[i],
                  "country"       => countries[i]) for (i, u) in enumerate(juliansinfo)]
userlinks = [Dict("source" => julians[src(e)], "target" => julians[dst(e)]) for e in edges(socialnet)]
userdata = Dict("nodes"=>usernodes, "links"=>userlinks)

# Dump to disk with 2-space indentation.
open("Julians.json", "w") do f
    JSON.print(f, userdata, 2)
end