import concurrent.futures
from itertools import count
from pathlib import Path
import numpy as np
import requests
from random import choices, randint
import hashlib
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd
# Endpoint that serves club recommendations for a submitted query form.
base = 'https://hookup-qubsu.org/home/GetResults'

# The full, ordered set of interest categories the form accepts; indices
# into this list are used as compact category identifiers.
categories = [
    "Activism", "Community", "Competing", "Culture",
    "Democracy", "Gaming", "Learning", "MakeFriends",
    "Network", "Outdoors", "Perform", "Stayactive",
]
def gen_q():
    """Build one random query payload for the recommender endpoint.

    Returns:
        h: md5 digest of the stringified payload — a stable fingerprint
           for deduplicating queries.
        q: dict of string form fields to POST to the service.
        _c: indices of the chosen categories within ``categories``.
    """
    # Number of categories to pick, drawn from N(mean=(len-1)//2, sd=1).
    # Clamp to a valid slice length: the raw draw can (rarely) fall
    # outside [0, len(categories)], which the original slice silently
    # mishandled (a negative length drops items from the end instead).
    n_pick = int(np.random.normal((len(categories) - 1) // 2))
    n_pick = max(0, min(len(categories), n_pick))
    # permutation() yields numpy.str_ elements; cast back to plain str so
    # str(q) — and therefore the md5 fingerprint — does not depend on the
    # numpy version's repr of np.str_.
    c = [str(k) for k in np.random.permutation(categories)[:n_pick]]
    _c = [categories.index(k) for k in c]
    q = {
        "Categories": c,
        "Budget": str(randint(0, 10)),
        "Time": str(randint(0, 10)),
        "Travel": str(randint(0, 10)),
        "Joined": str(randint(0, 10)),
    }
    # Fingerprint the exact payload so repeated queries can be detected.
    h = hashlib.md5(str(q).encode('utf-8')).digest()
    return h, q, _c
def get_clubs(q, timeout=30):
    """POST query ``q`` to the results endpoint and scrape club names.

    Args:
        q: form-field dict (see ``gen_q``).
        timeout: per-request timeout in seconds. Without one, a hung
            server would stall a worker thread forever — fatal in a
            run-forever batch scraper.

    Returns:
        (clubs, duration): list of club names taken from the ``h2``
        headings under ``div.answers``, and the server response time in
        seconds as measured by requests.
    """
    response = requests.post(base, data=q, timeout=timeout)
    duration = response.elapsed.total_seconds()
    soup = BeautifulSoup(response.content, 'html.parser')
    clubs = [h2.get_text() for h2 in soup.select('div.answers > h2')]
    return clubs, duration
def get_random_result():
    """Generate one random query, submit it, and return the enriched row.

    The returned dict is the query payload augmented with the scraped
    'Recommended' club list and the request 'Duration' in seconds.
    """
    _digest, query, _indices = gen_q()
    clubs, elapsed = get_clubs(query)
    query['Recommended'] = clubs
    query['Duration'] = elapsed
    return query
batch_size = int(1e4)

# Ensure the output directory exists up front — to_parquet raises if the
# parent directory is missing.
out_dir = Path('data')
out_dir.mkdir(parents=True, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    for i in count():  # run forever, one parquet batch per iteration
        # Skip past batch files left by previous runs.
        while (dest := out_dir / f'hookup_{i}.pa.pq').exists():
            i += 1
        print(dest)
        results = []
        futures = {executor.submit(get_random_result) for _ in range(batch_size)}
        for future in tqdm(concurrent.futures.as_completed(futures), total=batch_size):
            try:
                results.append(future.result())
            except Exception as e:
                # One failed request (timeout, connection reset, parse
                # error) must not kill the run or discard the whole batch;
                # log it and keep the successful results.
                print(f'request failed: {e!r}')
        pd.DataFrame(results).to_parquet(dest, engine='pyarrow')