Understand what search terms get entered, to help guide how to format the page titles of each search result.
# Evaluate `expr` with package startup messages, warnings, and messages
# suppressed, and return its value.
shhh <- function(expr) {
  suppressPackageStartupMessages(
    suppressWarnings(
      suppressMessages(expr)
    )
  )
}
shhh({
library(tidyverse); library(wmfdata)
})
I reviewed the top search terms for a set of 10 large Wikipedias (English, Spanish, German, French, Japanese, Russian, Italian, Chinese, Portuguese, and Polish) in August 2020.
# Review top autocomplete search terms on large wikis.
# For each search session the query ranks the logged autocomplete queries by
# length (longest first) to drop partially typed terms, keeps the rank-1 rows
# whose term is longer than 2 characters, and returns the 100 most frequent
# terms with their counts.
# NOTE(review): RANK() gives rank 1 to every row tied for the longest query in
# a session, so a session with ties can contribute more than one row to the
# counts. Confirm whether ROW_NUMBER() was intended.
query <-
"
-- find complete search term entered into search widget for all sessions
WITH ranked_searches AS (
SELECT
event.searchSessionid AS search_session,
event.query AS search_query,
--find longest length search query in each session to remove partial searches
RANK() OVER (PARTITION BY event.searchSessionid
ORDER BY LENGTH(event.query) DESC) AS ranking
FROM event.SearchSatisfaction
WHERE year = 2020 and month = 08
AND event.source = 'autocomplete'
AND event.action = 'searchResultPage'
AND wiki IN ('enwiki', 'eswiki', 'dewiki', 'frwiki', 'jawiki', 'ruwiki', 'itwiki', 'zhwiki', 'ptwiki', 'plwiki')
AND useragent.is_bot = false
)
SELECT
search_query,
Count(*) as n_searches
FROM ranked_searches
WHERE
--longest character search term entered in session
ranking = 1
--keep only search terms longer than 2 characters (i.e. at least 3)
AND LENGTH(search_query) > 2
GROUP BY
search_query
ORDER BY n_searches DESC
LIMIT 100
"
autocomplete_queries_largewiki <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Preview the first 15 rows of the large-wiki autocomplete result.
head(autocomplete_queries_largewiki, n = 15)
search_query | n_searches | |
---|---|---|
<chr> | <int> | |
1 | nasdaq | 156726 |
2 | 2020 | 31542 |
3 | part of an url | 22403 |
4 | tenet | 22117 |
5 | kamala | 17500 |
6 | covid | 14902 |
7 | lucifer | 14334 |
8 | belarus | 14111 |
9 | joe biden | 13479 |
10 | the | 13173 |
11 | kamala harris | 12880 |
12 | dark | 8601 |
13 | the boys | 8423 |
14 | donald trump | 8208 |
15 | the batman | 7988 |
# Review top full-text search terms on large wikis.
# Counts search result page events whose source is 'fulltext', per query
# string, for August 2020 (bots excluded), and returns the 100 most frequent
# queries.
# NOTE(review): the wiki list below includes 'arwiki' and omits 'enwiki',
# unlike the wiki list used by the large-wiki autocomplete query in this
# notebook. Confirm this difference is intended.
query <-
"SELECT
event.query AS search_query,
COUNT(*) AS n_searches
FROM
event.SearchSatisfaction
WHERE
event.action = 'searchResultPage'
AND event.source = 'fulltext'
--top 10 wikis by size
AND wiki IN ('eswiki', 'dewiki', 'frwiki', 'jawiki', 'ruwiki', 'itwiki', 'zhwiki', 'ptwiki', 'plwiki', 'arwiki')
AND year = 2020 and month = 08
AND useragent.is_bot = false
GROUP BY
event.query
ORDER BY n_searches DESC
LIMIT 100"
fulltext_queries_largewiki <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Preview the first 10 rows of the large-wiki full-text result.
head(fulltext_queries_largewiki, n = 10)
search_query | n_searches | |
---|---|---|
<chr> | <int> | |
1 | part of an url | 73912 |
2 | us party affiliation | 6406 |
3 | va privatization | 5110 |
4 | MGM Television | 3859 |
5 | armed teachers | 3410 |
6 | Warner Bros. Television Distribution | 2943 |
7 | "lots of" | 2791 |
8 | "日本語吹替" | 2740 |
9 | Warner Bros. Family Entertainment | 2378 |
10 | Jemeter | 2083 |
Top search terms for a set of 10 small Wikipedias (Persian, Catalan, Serbian, Indonesian, Norwegian, Korean, Finnish, Hungarian, Czech, and Serbo-Croatian) in August 2020. Note: I selected wikis with at least 100,000 articles to have enough data for the analysis and to avoid any privacy/sensitive-data concerns that may result from reviewing small wikis with only a small number of searches.
# Review top autocomplete search terms on small wikis.
# For each search session the query ranks the logged autocomplete queries by
# length (longest first) to drop partially typed terms, keeps the rank-1 rows
# whose term is longer than 2 characters, and returns the 100 most frequent
# terms with their counts.
# NOTE(review): RANK() gives rank 1 to every row tied for the longest query in
# a session, so a session with ties can contribute more than one row to the
# counts. Confirm whether ROW_NUMBER() was intended.
query <-
"
-- find complete search term entered into search widget for all sessions
WITH ranked_searches AS (
SELECT
event.searchSessionid AS search_session,
event.query AS search_query,
--find longest length search query in each session to remove partial searches
RANK() OVER (PARTITION BY event.searchSessionid
ORDER BY LENGTH(event.query) DESC) AS ranking
FROM event.SearchSatisfaction
WHERE year = 2020 and month = 08
AND event.source = 'autocomplete'
AND event.action = 'searchResultPage'
AND wiki IN ('fawiki', 'cawiki', 'srwiki', 'idwiki', 'nowiki', 'kowiki', 'fiwiki', 'huwiki', 'cswiki', 'shwiki')
AND useragent.is_bot = false
)
SELECT
search_query,
Count(*) as n_searches
FROM ranked_searches
WHERE
--longest character search term entered in session
ranking = 1
--keep only search terms longer than 2 characters (i.e. at least 3)
AND LENGTH(search_query) > 2
GROUP BY
search_query
ORDER BY n_searches DESC
LIMIT 100
"
autocomplete_queries_smallwiki <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Preview the first 15 rows of the small-wiki autocomplete result.
head(autocomplete_queries_smallwiki, n = 15)
search_query | n_searches | |
---|---|---|
<chr> | <int> | |
1 | libanon | 1067 |
2 | bělorusko | 983 |
3 | 2020 | 720 |
4 | ledek | 668 |
5 | suomi | 535 |
6 | سوپر جام | 441 |
7 | covid | 344 |
8 | usa | 341 |
9 | ایران | 322 |
10 | praha | 318 |
11 | česká republika | 309 |
12 | سکس | 292 |
13 | valko | 277 |
14 | 대한민국 | 271 |
15 | česko | 268 |
Based on the above, the top search terms on the small and large wikis reviewed in August are mostly single words. "Kamala" received more searches than "Kamala Harris", indicating that a larger number of people selected a search result provided in the drop-down menu or pressed the search button before entering her entire name into the search box.
There are a couple of terms, such as "part of an url", that are likely caused by unidentified bots. Other common search terms appear to include names of people and events currently in the news.
There's also a large number of searches that start with "the". For the "the" searches, this was the longest recorded search term in those sessions, so users either started typing a search beginning with "the" and selected one of the provided drop-down results or, more likely, just abandoned the search.
# Count autocomplete searches by number of words in the search term, using
# the same small-wiki list and session filter as the small-wiki autocomplete
# query.
# The word count is computed as (number of space characters in the term) + 1.
# NOTE(review): leading, trailing, or repeated spaces therefore raise the
# word count. Confirm whether terms should be trimmed/normalized first.
# NOTE(review): RANK() gives rank 1 to every row tied for the longest query in
# a session, so a session with ties can contribute more than one row to the
# counts. Confirm whether ROW_NUMBER() was intended.
query <-
"
-- find complete search term entered into search widget for all sessions
WITH ranked_searches AS (
SELECT
event.searchSessionid AS search_session,
event.query AS search_query,
--find longest length search query in each session to remove partial searches
RANK() OVER (PARTITION BY event.searchSessionid
ORDER BY LENGTH(event.query) DESC) AS ranking
FROM event.SearchSatisfaction
WHERE year = 2020 and month = 08
AND event.source = 'autocomplete'
AND event.action = 'searchResultPage'
AND wiki IN ('fawiki', 'cawiki', 'srwiki', 'idwiki', 'nowiki', 'kowiki', 'fiwiki', 'huwiki', 'cswiki', 'shwiki')
AND useragent.is_bot = false
)
SELECT
-- find number of words
(LENGTH(search_query) - LENGTH(REGEXP_REPLACE(search_query,' ',''))+1) AS num_words,
Count(*) AS n_searches
FROM ranked_searches
WHERE
--longest character search term entered in session
ranking = 1
--keep only search terms longer than 2 characters (i.e. at least 3)
AND LENGTH(search_query) > 2
GROUP BY
(LENGTH(search_query) - LENGTH(REGEXP_REPLACE(search_query,' ',''))+1)
ORDER BY n_searches DESC
LIMIT 100"
autocomplete_queries_wordcount <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
# Preview the first 20 rows of the word-count result.
head(autocomplete_queries_wordcount, n = 20)
num_words | n_searches | |
---|---|---|
<int> | <int> | |
1 | 1 | 914678 |
2 | 2 | 733311 |
3 | 3 | 174888 |
4 | 4 | 55795 |
5 | 5 | 21810 |
6 | 6 | 10633 |
7 | 7 | 5564 |
8 | 8 | 3058 |
9 | 9 | 1963 |
10 | 10 | 1127 |
11 | 11 | 794 |
12 | 12 | 594 |
13 | 13 | 425 |
14 | 14 | 291 |
15 | 15 | 248 |
16 | 16 | 219 |
17 | 17 | 193 |
18 | 20 | 152 |
19 | 18 | 140 |
20 | 19 | 131 |
# Add each word count's share of the returned searches as a percentage,
# rounded to 2 decimal places, then preview the first 10 rows.
autocomplete_queries_wordcount_prop <- mutate(
  autocomplete_queries_wordcount,
  prop_searches = round(n_searches / sum(n_searches) * 100, 2)
)
head(autocomplete_queries_wordcount_prop, n = 10)
num_words | n_searches | prop_searches | |
---|---|---|---|
<int> | <int> | <dbl> | |
1 | 1 | 914678 | 47.46 |
2 | 2 | 733311 | 38.05 |
3 | 3 | 174888 | 9.07 |
4 | 4 | 55795 | 2.90 |
5 | 5 | 21810 | 1.13 |
6 | 6 | 10633 | 0.55 |
7 | 7 | 5564 | 0.29 |
8 | 8 | 3058 | 0.16 |
9 | 9 | 1963 | 0.10 |
10 | 10 | 1127 | 0.06 |
One-word or two-word searches account for 85.5% of autocomplete searches on the small wikis reviewed. One-word searches are about 24.7% more frequent than two-word searches.