#!pip install gensim seaborn wordcloud rank_bm25 nltk ktrain holoviews datashader
#!pip install https://github.com/lambdaofgod/ktrain/archive/nmf_topic_modeling.zip
import re
import tqdm
import requests
import numpy as np
from markdown import markdown
import nltk
import pandas as pd
from pandas.io.json import json_normalize
from gensim import summarization
from sklearn import decomposition, feature_extraction, manifold
from sklearn.feature_extraction import stop_words
from sklearn import pipeline
import rank_bm25
import seaborn as sns
import wordcloud
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
import bokeh.model
import bokeh.plotting
import bokeh.io
import re
import umap
from sklearn import metrics
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import datashade, dynspread
from holoviews.operation import decimate
# Send Bokeh figures to the notebook output instead of a separate HTML file.
bokeh.io.output_notebook()
# Apply matplotlib's 'ggplot' style sheet to the matplotlib figures drawn below.
plt.style.use('ggplot')
def printmd(string):
    """Render `string` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)
def get_word_cloud(texts):
    """Build a word cloud from an iterable of strings.

    The strings are joined with single spaces and passed to a ``WordCloud``
    created with ``max_font_size=40``; the generated cloud is returned.
    """
    cloud_builder = wordcloud.WordCloud(max_font_size=40)
    joined_text = ' '.join(texts)
    return cloud_builder.generate(joined_text)
def show_word_cloud(wc, figure_kwargs=None):
    """Display a word cloud image with matplotlib.

    Args:
        wc: Image-like object accepted by ``plt.imshow`` (in this notebook,
            the result of ``get_word_cloud``).
        figure_kwargs: Keyword arguments forwarded to ``plt.figure``. When
            omitted, ``{'figsize': (8, 5)}`` is used.
    """
    # A None sentinel is used instead of a dict default so that no mutable
    # object is shared between calls.
    if figure_kwargs is None:
        figure_kwargs = {'figsize': (8, 5)}
    plt.figure(**figure_kwargs)
    plt.imshow(wc)
    plt.axis('off')  # hide axes, ticks and frame around the image
    plt.show()
def show_word_cloud_from_texts(text_column):
    """Show a word cloud for a pandas text column.

    Missing values are replaced with empty strings before the cloud is built.
    """
    filled_texts = text_column.fillna('').values
    show_word_cloud(get_word_cloud(filled_texts))
Put your GitHub token in `github_auth_key.txt`.
You have to do this because, unfortunately, GitHub's GraphQL API cannot currently be used without authentication.
# Read the GitHub token from disk; the context manager makes sure the file
# handle is closed again (the bare open(...).read() left it open).
with open('github_auth_key.txt', 'r') as key_file:
    key = key_file.read().strip()
Note that I barely know GraphQL: I built this query in GitHub's GraphQL API explorer.
# HTTP headers for the GitHub API requests: token-based Authorization built from `key`.
headers = {'Authorization': 'token ' + key}
def run_query(query):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    Args:
        query: GraphQL query text.

    Raises:
        Exception: if the HTTP status code is anything other than 200.
    """
    response = requests.post(
        'https://api.github.com/graphql',
        json={'query': query},  # sent as a JSON request body
        headers=headers,
    )
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))
    return response.json()
# The GraphQL query itself (with a few additional bits included), defined as a multi-line string.
def get_next_paged_result(result):
    """Fetch the page of starred repositories that follows `result`.

    Args:
        result: The response previously returned by this function, or None to
            request the first page.

    Returns:
        The decoded JSON response of the GraphQL query (see ``run_query``).
    """
    if result is not None:
        page_info = result['data']['viewer']['starredRepositories']['pageInfo']
        cursor_argument = 'after: "{}"'.format(page_info['endCursor'])
    else:
        # First page: no cursor argument is sent.
        cursor_argument = ''
    # Doubled braces are literal braces for str.format; the single {} slot
    # receives the cursor argument.
    query_template = """{{
viewer {{
starredRepositories(first: 100, {}) {{
pageInfo {{
startCursor
hasNextPage
endCursor
}}
nodes {{
name
owner {{
login
}}
description
primaryLanguage {{
name
}}
repositoryTopics(first: 10) {{
nodes {{
topic {{
name
}}
}}
}}
object(expression: "master:README.md") {{
... on Blob {{
text
}}
}}
}}
}}
}}
}}"""
    return run_query(query_template.format(cursor_argument))
def get_starred_repo_information(n_pages=6):
    """Collect starred-repository records from up to `n_pages` result pages.

    Args:
        n_pages: Maximum number of pages to request (each query asks for up to
            100 repositories).

    Returns:
        A list with the ``nodes`` entries of all fetched pages, in order.
    """
    page = None
    repositories = []
    for __ in tqdm.tqdm(range(n_pages)):
        page = get_next_paged_result(page)
        starred = page['data']['viewer']['starredRepositories']
        # extend() appends in place instead of rebuilding the list on every page.
        repositories.extend(starred['nodes'])
        # Stop at the last page: requesting beyond it would format a missing
        # endCursor into the next query as an invalid cursor.
        if not starred['pageInfo']['hasNextPage']:
            break
    return repositories
# Download the starred-repository records with the default page limit (up to 6 pages of up to 100 repositories).
starred_repo_information = get_starred_repo_information()
100%|██████████| 6/6 [00:28<00:00, 4.80s/it]
from nltk import stem, tokenize
# WordNet lemmatizer shared by the text-cleaning helpers below.
lemmatizer = stem.WordNetLemmatizer()
# NOTE(review): bare expression whose result is discarded outside a notebook — presumably a quick sanity check of the lemmatizer.
lemmatizer.lemmatize('repositories')
def clean_and_stem(text):
    """Lower-case `text`, strip non-alphanumeric characters and lemmatize each token.

    Args:
        text: Raw description string.

    Returns:
        The lemmatized tokens joined with single spaces.
    """
    # Negated character class: every run of characters that is NOT an ASCII
    # letter or digit becomes one space. (With the caret outside the brackets
    # the pattern is anchored at the start and only removes the first word.)
    cleaned_text = re.sub(r'[^0-9a-zA-Z]+', ' ', text.lower())
    tokens = tokenize.wordpunct_tokenize(cleaned_text)
    return ' '.join(lemmatizer.lemmatize(w) for w in tokens)
def get_cleaned_starred_repositories_df(repo_information):
    """Turn raw starred-repository records into a cleaned DataFrame indexed by repository name.

    The nested records are flattened with ``json_normalize`` and the columns
    ``primaryLanguage``, ``topics``, ``description_stemmed``,
    ``description_keywords`` and ``description_length`` are added.

    Args:
        repo_information: List of repository records (the ``nodes`` entries of
            the GraphQL responses).

    Returns:
        The DataFrame restricted to repositories with a non-empty description.
    """
    repo_df = json_normalize(repo_information).set_index('name')
    repo_df['primaryLanguage'] = repo_df['primaryLanguage.name']
    # DataFrame.drop returns a new frame; the result has to be assigned,
    # otherwise the flattened source column stays in the frame.
    repo_df = repo_df.drop('primaryLanguage.name', axis=1)
    repo_df['topics'] = repo_df['repositoryTopics.nodes'].apply(
        lambda recs: [lemmatizer.lemmatize(r['topic']['name']) for r in recs])
    # Assign instead of calling fillna(inplace=True) on a column selection,
    # which is not guaranteed to write back to the frame.
    repo_df['description'] = repo_df['description'].fillna('')
    repo_df['description_stemmed'] = repo_df['description'].apply(clean_and_stem)
    repo_df['description_keywords'] = repo_df['description_stemmed'].apply(summarization.keywords)
    repo_df['description_length'] = repo_df['description'].str.split().apply(
        lambda words: 0 if words is None else len(words))
    # Repositories without any description text carry no signal for the analyses.
    return repo_df[repo_df['description_length'] > 0]
def get_topic_representant_indices(topic_weights, topic_idx, num_representants=5):
    """Return the row positions with the largest weights in column `topic_idx`.

    Args:
        topic_weights: 2-D array indexed as ``[row, topic]``.
        topic_idx: Column (topic) to rank by.
        num_representants: Maximum number of positions to return.

    Returns:
        Row positions ordered from highest to lowest weight.
    """
    ascending_order = topic_weights[:, topic_idx].argsort()
    descending_order = ascending_order[::-1]
    return descending_order[:num_representants]
def get_repos_representing_topic(repo_df, topic_weights, topic_idx, num_representants=5):
    """Return the rows of `repo_df` weighted most heavily on topic `topic_idx`.

    Rows are selected positionally, so `topic_weights` is assumed to have one
    row per row of `repo_df`, in the same order.
    """
    top_positions = get_topic_representant_indices(topic_weights, topic_idx, num_representants)
    return repo_df.iloc[top_positions]
def plot_description_lengths(description_lengths):
    """Show a 25-bin Bokeh histogram of description lengths with the median marked in red.

    Args:
        description_lengths: pandas Series of description word counts.
    """
    hist, edges = np.histogram(description_lengths.values, bins=25)
    median_description_length = description_lengths.median()
    p = bokeh.plotting.figure(
        title='Description length',
        x_axis_label='words in description',
        y_axis_label='number of repositories',
        plot_height=600, plot_width=800)
    p.quad(top=hist, left=edges[:-1], right=edges[1:], bottom=0)
    # Draw the median marker up to the tallest bar rather than to a fixed
    # height, so it fits whatever data is plotted.
    marker_top = int(hist.max())
    p.line([median_description_length, median_description_length], [0, marker_top], line_color='red')
    bokeh.plotting.show(p)
from bokeh import palettes
def plot_2d_data(data, text_label, cls, show_text=True, subset=None):
    """Scatter-plot 2-D points with Bokeh, coloured by class, optionally labelling a subset.

    Args:
        data: Array indexed as ``data[:, 0]`` (x) and ``data[:, 1]`` (y).
        text_label: Label text for each point.
        cls: Integer class id for each point, used to pick its colour.
        show_text: When False, no text labels are drawn even if `subset` is given.
        subset: Positional selector (passed to ``DataFrame.iloc``) of the
            points to label; None draws no labels.
    """
    # Index the full 20-colour palette directly and wrap around, so class ids
    # of 18 and above no longer look up a palette size that does not exist.
    category20 = palettes.d3['Category20'][20]
    colors = [category20[c % len(category20)] for c in cls]
    x, y = data[:, 0], data[:, 1]
    source_df = pd.DataFrame({'x': x, 'y': y, 'text_label': text_label, 'color': colors})
    source = bokeh.models.ColumnDataSource(source_df)
    TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,"
    p = bokeh.plotting.figure(tools=TOOLS, plot_width=800, plot_height=600)
    p.scatter(x='x', y='y', source=source, fill_color='color', line_color='color')
    if show_text and subset is not None:
        # Labels get their own data source restricted to the requested points.
        text_labels = bokeh.models.LabelSet(x='x', y='y', text='text_label', level='glyph',
                                            x_offset=5, y_offset=5, source=bokeh.models.ColumnDataSource(source_df.iloc[subset]), render_mode='canvas', text_font_size='7pt')
        p.add_layout(text_labels)
    bokeh.plotting.show(p)
# Build the cleaned, name-indexed DataFrame that the rest of the notebook works on.
starred_repo_df = get_cleaned_starred_repositories_df(starred_repo_information)
Primary language
# Count repositories per primary language; missing languages are grouped under 'unspecified'.
language_counts = starred_repo_df['primaryLanguage'].fillna('unspecified').value_counts()
# Categorical x-axis with one entry per language, in value_counts order.
p = bokeh.plotting.figure(x_range=list(language_counts.index), title='Repository number by language')
p.vbar(x=language_counts.index, top=language_counts, width=1)
# Vertical tick labels (presumably so that long language names stay readable).
p.xaxis.major_label_orientation = "vertical"
bokeh.plotting.show(p)
# Summary statistics of the description word counts, followed by their histogram.
print(starred_repo_df['description_length'].describe())
plot_description_lengths(starred_repo_df['description_length'])
count 571.000000 mean 10.082312 std 6.451048 min 1.000000 25% 6.000000 50% 9.000000 75% 12.000000 max 69.000000 Name: description_length, dtype: float64
# Keep only repositories whose description has more than 5 words.
starred_repo_df = starred_repo_df[starred_repo_df['description_length'] > 5]
# Word cloud of repository topics (each repository's topic list joined into one string).
show_word_cloud_from_texts(starred_repo_df['topics'].apply(' '.join))
Descriptions word cloud
# Word cloud of the cleaned, lemmatized descriptions.
show_word_cloud_from_texts(starred_repo_df['description_stemmed'])
# Word cloud of the keywords extracted from those descriptions.
show_word_cloud_from_texts(starred_repo_df['description_keywords'])
We have a collection of documents $d_i$ and want to find some documents.
We formulate a query $q$ for which the system returns some documents with relevance scores.
The system can be evaluated (for queries with known responses) as a classifier.
Because of that we use precision and recall scores (why these instead of accuracy?)
Also we can use ranking metrics.
substring matching
break texts down into words and match them
represent documents and queries as vectors
use similarity/dissimilarity (distance) to score vectors for a query
TF-IDF, BM-25 can be interpreted as this - similarity is calculated as dot product in appropriate space
sklearn.feature_extraction.text.{Count,Tfidf}Vectorizer
Now we can use machine learning!
import rank_bm25
class SearchEngine:
    """BM25 search over one whitespace-tokenized text column of a DataFrame."""

    def __init__(self, df, bm25_cls=rank_bm25.BM25Okapi, text_col='text'):
        """Index the texts in ``df[text_col]``.

        Args:
            df: DataFrame holding the documents; search results are rows of it.
            bm25_cls: BM25 implementation, constructed from the tokenized corpus.
            text_col: Name of the column containing the document text.
        """
        tokenized_corpus = df[text_col].str.split()
        self.bm25 = bm25_cls(tokenized_corpus)
        self.df = df

    def search(self, query, k=100):
        """Return up to `k` rows with a positive score for `query`, best match first.

        The query is split on whitespace, like the indexed documents.
        """
        scores = self.bm25.get_scores(query.split())
        # Negating the scores makes argsort order them from highest to lowest.
        top_k = np.argsort(-scores)[:k]
        has_positive_score = scores[top_k] > 0
        return self.df.iloc[top_k[has_positive_score]]
# Index the lemmatized descriptions with BM25 and run a sample two-word query.
search_engine = SearchEngine(starred_repo_df, text_col='description_stemmed')
search_engine.search('information retrieval')
description | owner.login | primaryLanguage.name | repositoryTopics.nodes | object.text | primaryLanguage | object | topics | description_stemmed | description_keywords | description_length | |
---|---|---|---|---|---|---|---|---|---|---|---|
name | |||||||||||
musicinformationretrieval.com | Instructional notebooks on music information r... | stevetjoa | Jupyter Notebook | [{'topic': {'name': 'ipython-notebook'}}, {'to... | stanford-mir\n============\n\n[![Stories in Re... | Jupyter Notebook | NaN | [ipython-notebook, music-information-retrieval... | notebook on music information retrieval . | 6 | |
anserini | A Lucene toolkit for replicable information re... | castorini | Java | [{'topic': {'name': 'information-retrieval'}},... | Anserini\n========\n[![Build Status](https://t... | Java | NaN | [information-retrieval, lucene] | lucene toolkit for replicable information retr... | retrieval | 8 |
awesome-information-retrieval | A curated list of awesome information retrieva... | harpribot | NaN | [] | # Awesome Information Retrieval [![Awesome](ht... | NaN | NaN | [] | curated list of awesome information retrieval ... | retrieval | 8 |
LIRE | Open source library for content based image re... | dermotte | Java | [{'topic': {'name': 'image-retrieval'}}, {'top... | # LIRE - Lucene Image Retrieval\nLIRE (Lucene ... | Java | NaN | [image-retrieval, lira, multimedia] | source library for content based image retriev... | retrieval | 12 |
wikIR | A python tool for building large scale Wikiped... | getalp | Python | [] | # WIKIR\nA python tool for building large scal... | Python | NaN | [] | python tool for building large scale wikipedia... | retrieval\nlarge | 11 |
pytrec_eval | pytrec_eval is an Information Retrieval evalua... | cvangysel | C++ | [{'topic': {'name': 'information-retrieval'}},... | pytrec_eval\n===========\n\npytrec\_eval is a ... | C++ | NaN | [information-retrieval, evaluation] | _eval is an information retrieval evaluation t... | evaluation | 14 |
query-expansion | Developing different methods for expanding a q... | phosseini | Python | [] | # query-expansion\n\nThis repository is dedica... | Python | NaN | [] | different method for expanding a query / topic... | expanding\nexpanded\nquery | 19 |
cnnimageretrieval-pytorch | CNN Image Retrieval in PyTorch: Training and e... | filipradenovic | Python | [{'topic': {'name': 'image-retrieval'}}, {'top... | ## CNN Image Retrieval in PyTorch: Training an... | Python | NaN | [image-retrieval, convolutional-neural-network... | image retrieval in pytorch : training and eval... | retrieval | 14 |
StarSpace | Learning embeddings for classification, retrie... | facebookresearch | C++ | [] | <p align="center"><img width="15%" src="exampl... | C++ | NaN | [] | embeddings for classification , retrieval and ... | 7 | |
sparse_recovery | noiseless/nonnegative sparse recovery and feat... | NLPrinceton | Python | [] | # sparse_recovery\n\nThis module provides solv... | Python | NaN | [] | / nonnegative sparse recovery and feature retr... | sparse | 9 |
revisitop | Revisiting Oxford and Paris: Large-Scale Image... | filipradenovic | Python | [{'topic': {'name': 'image-retrieval'}}, {'top... | # Revisiting Oxford and Paris: Large-Scale Ima... | Python | NaN | [image-retrieval, matlab, python] | oxford and paris : large - scale image retriev... | large | 8 |
CBIR | 🏞 A content-based image retrieval (CBIR) system | pochih | Python | [{'topic': {'name': 'image-retrieval'}}, {'top... | [![Open Source Love](https://badges.frapsoft.c... | Python | NaN | [image-retrieval, computer-vision, gabor, hog,... | 🏞 a content - based image retrieval ( cbir ) s... | based | 7 |
deep-image-retrieval | End-to-end learning of deep visual representat... | almazan | Python | [] | # Deep Image Retrieval\n\nThis repository cont... | Python | NaN | [] | - to - end learning of deep visual representat... | visual | 9 |
minmaxcsa | MinMax Circular Sector Arc for External Plagia... | duartefellipe | Python | [{'topic': {'name': 'plagiarism-detection'}}, ... | ## Minmax Circular Sector Arcs (MinMaxCSA): A ... | Python | NaN | [plagiarism-detection, locality-sensitive-hash... | circular sector arc for external plagiarism ’ ... | sector | 11 |
ir-python | A python implementation for information retrie... | zxzlogic | Python | [] | # ir-python\nA python implementation for infor... | Python | NaN | [] | python implementation for information retrieva... | retrieval\npython\nindex\nindexing\nsafe\ngoogle | 34 |
ParetoMTL | Code for Neural Information Processing Systems... | Xi-L | Python | [] | # Pareto Multi-Task Learning\nCode for Neural ... | Python | NaN | [] | for neural information processing system ( neu... | information | 12 |
Open-IE-Papers | Open Information Extraction (OpenIE) and Open ... | NPCai | NaN | [{'topic': {'name': 'openie'}}, {'topic': {'na... | # Table of Contents\n\n1. [General](#general)\... | NaN | NaN | [openie, literature-review, paper, nlp, inform... | information extraction ( openie ) and open rel... | extraction | 12 |
hashing-baseline-for-image-retrieval | :octocat:Various hashing methods for image ret... | willard-yuan | MATLAB | [{'topic': {'name': 'hashing-library'}}, {'top... | # HABIR Toolkit\n\n[![License](https://img.shi... | MATLAB | NaN | [hashing-library, image-retrieval, ann] | : octocat : various hashing method for image r... | 11 | |
paws | This dataset contains 108,463 human-labeled an... | google-research-datasets | Python | [] | # PAWS: Paraphrase Adversaries from Word Scram... | Python | NaN | [] | dataset contains 108 , 463 human - labeled and... | labeled\nstructure | 28 |
berkeley-doc-summarizer | The Berkeley Document Summarizer is a learning... | gregdurrett | Scala | [] | berkeley-doc-summarizer\n=====================... | Scala | NaN | [] | berkeley document summarizer is a learning - b... | document\nsyntactic\nbased | 28 |
BM comes from 'Best Match'
Difference from TF-IDF: BM25 is not symmetric (queries and documents are treated differently, for example because their lengths tend to differ)
Pros:
Cons:
from sklearn import feature_extraction
# TF-IDF features over unigrams and bigrams of the lemmatized descriptions.
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
# fit_transform yields one row per description and one column per n-gram (see the shape printed below).
term_document_matrix = vectorizer.fit_transform(starred_repo_df['description_stemmed'])
term_document_matrix.shape
(459, 5005)
# Sample single-word query.
search_engine.search('image')
description | owner.login | primaryLanguage.name | repositoryTopics.nodes | object.text | primaryLanguage | object | topics | description_stemmed | description_keywords | description_length | |
---|---|---|---|---|---|---|---|---|---|---|---|
name | |||||||||||
open-images | Build an example image classifier using Google... | quiltdata | Jupyter Notebook | [] | # open images\n\nThis repository contains the ... | Jupyter Notebook | NaN | [] | an example image classifier using google open ... | image | 10 |
nsfw_data_source_urls | Collection of NSFW images URLs for the purpose... | EBazarov | NaN | [] | # NSFW data source URLs\n\n## Description\n\nR... | NaN | NaN | [] | of nsfw image url for the purpose of training ... | 14 | |
cnnimageretrieval-pytorch | CNN Image Retrieval in PyTorch: Training and e... | filipradenovic | Python | [{'topic': {'name': 'image-retrieval'}}, {'top... | ## CNN Image Retrieval in PyTorch: Training an... | Python | NaN | [image-retrieval, convolutional-neural-network... | image retrieval in pytorch : training and eval... | retrieval | 14 |
FUNIT_tensorflow | Tensorflow Implementation of FUNIT: Few-Shot U... | zhangqianhui | Python | [{'topic': {'name': 'image-to-image-translatio... | # FUNIT_tensorflow\nTensorflow Implementation ... | Python | NaN | [image-to-image-translation, few-shot-learning... | implementation of funit : few - shot unsupervi... | 8 | |
imagehash | A Python Perceptual Image Hashing Module | JohannesBuchner | Python | [] | NaN | Python | NaN | [] | python perceptual image hashing module | perceptual | 6 |
nsfw_data_scraper | Collection of scripts to aggregate image data ... | alex000kim | Shell | [{'topic': {'name': 'nsfw-classifier'}}, {'top... | # NSFW Data Scraper\n\n## Note: use with cauti... | Shell | NaN | [nsfw-classifier, nsfw, deep-learning, content... | of script to aggregate image data for the purp... | image | 16 |
image-to-image-papers | 🦓<->🦒 🌃<->🌆 A collection of image to image pa... | lzhbrian | NaN | [{'topic': {'name': 'image-to-image'}}, {'topi... | # Image-to-Image papers\n\nA collection of ima... | NaN | NaN | [image-to-image, generative-adversarial-networ... | 🦓<->🦒 🌃<->🌆 a collection of image to image pap... | constantly | 13 |
snowy | Small Image Library for Python 3 | prideout | Python | [{'topic': {'name': 'python'}}, {'topic': {'na... | [![Build Status](https://travis-ci.org/prideou... | Python | NaN | [python, image-processing] | small image library for python 3 | 6 | |
image-match | 🎇 Quickly search over billions of images | EdjoLabs | Python | [{'topic': {'name': 'image-analysis'}}, {'topi... | [![PyPI](https://img.shields.io/pypi/status/im... | Python | NaN | [image-analysis, image-signatures, python, sea... | 🎇 quickly search over billion of image | 7 | |
revisitop | Revisiting Oxford and Paris: Large-Scale Image... | filipradenovic | Python | [{'topic': {'name': 'image-retrieval'}}, {'top... | # Revisiting Oxford and Paris: Large-Scale Ima... | Python | NaN | [image-retrieval, matlab, python] | oxford and paris : large - scale image retriev... | large | 8 |
nsfw-v2 | NSFW Image Detector with REST interface develo... | sajithm | Python | [{'topic': {'name': 'python'}}, {'topic': {'na... | # nsfw-v2\nAn NSFW detector serving responses ... | Python | NaN | [python, nsfw-recognition, kera, flask, convol... | image detector with rest interface developed u... | interface | 11 |
Image-to-Image-Search | A reverse image search engine powered by elast... | sethuiyer | Python | [{'topic': {'name': 'deep-learning'}}, {'topic... | <img src="static/logo.jpg"/>\n\nSmartSearch is... | Python | NaN | [deep-learning, search-engine, elasticsearch, ... | reverse image search engine powered by elastic... | search | 11 |
MAX-Object-Detector | Localize and identify multiple objects in a s... | IBM | Python | [{'topic': {'name': 'docker-image'}}, {'topic'... | [![Build Status](https://travis-ci.com/IBM/MAX... | Python | NaN | [docker-image, machine-learning, machine-learn... | localize and identify multiple object in a sin... | multiple | 9 |
imgdupes | Finding and deleting near-duplicate images bas... | knjcode | Python | [{'topic': {'name': 'image'}}, {'topic': {'nam... | # imgdupes\n\n`imgdupes` is a command line too... | Python | NaN | [image, dedupe, perceptual-hashing, perceptual... | and deleting near - duplicate image based on p... | image | 9 |
albumentations | fast image augmentation library and easy to us... | albumentations-team | Python | [{'topic': {'name': 'image-augmentation'}}, {'... | # Albumentations\n[![PyPI version](https://bad... | Python | NaN | [image-augmentation, machine-learning, augment... | image augmentation library and easy to use wra... | augmentation | 12 |
CBIR | 🏞 A content-based image retrieval (CBIR) system | pochih | Python | [{'topic': {'name': 'image-retrieval'}}, {'top... | [![Open Source Love](https://badges.frapsoft.c... | Python | NaN | [image-retrieval, computer-vision, gabor, hog,... | 🏞 a content - based image retrieval ( cbir ) s... | based | 7 |
deep-image-retrieval | End-to-end learning of deep visual representat... | almazan | Python | [] | # Deep Image Retrieval\n\nThis repository cont... | Python | NaN | [] | - to - end learning of deep visual representat... | visual | 9 |
LIRE | Open source library for content based image re... | dermotte | Java | [{'topic': {'name': 'image-retrieval'}}, {'top... | # LIRE - Lucene Image Retrieval\nLIRE (Lucene ... | Java | NaN | [image-retrieval, lira, multimedia] | source library for content based image retriev... | retrieval | 12 |
DeOldify | A Deep Learning based project for colorizing a... | jantic | Jupyter Notebook | [] | \n# DeOldify\n\nImage [<img src="https://colab... | Jupyter Notebook | NaN | [] | deep learning based project for colorizing and... | old | 13 |
hashing-baseline-for-image-retrieval | :octocat:Various hashing methods for image ret... | willard-yuan | MATLAB | [{'topic': {'name': 'hashing-library'}}, {'top... | # HABIR Toolkit\n\n[![License](https://img.shi... | MATLAB | NaN | [hashing-library, image-retrieval, ann] | : octocat : various hashing method for image r... | 11 | |
dl-training-datasets | Set of scripts to download datasets of images ... | SaMnCo | Shell | [] | # About this repository\n\n**Notes 2016-01-08*... | Shell | NaN | [] | of script to download datasets of image and cr... | annotation | 17 |
colorization | Automatic colorization using deep neural netwo... | richzhang | Jupyter Notebook | [{'topic': {'name': 'caffe'}}, {'topic': {'nam... | <!--<h3><b>Colorful Image Colorization</b></h3... | Jupyter Notebook | NaN | [caffe, colorization, automatic-colorization, ... | colorization using deep neural network . " col... | neural | 12 |
image_captioning | Tensorflow implementation of "Show, Attend an... | DeepRNN | Python | [] | ### Introduction\nThis neural system for image... | Python | NaN | [] | implementation of " show , attend and tell : n... | caption | 14 |
Colorizing-with-GANs | Grayscale Image Colorization with Generative A... | ImagingLab | Python | [{'topic': {'name': 'deep-learning'}}, {'topic... | # Image Colorization with Generative Adversari... | Python | NaN | [deep-learning, generative-adversarial-network... | image colorization with generative adversarial... | arxiv | 8 |
tencent-ml-images | Largest multi-label image database; ResNet-101... | Tencent | Python | [{'topic': {'name': 'database'}}, {'topic': {'... | # Tencent ML-Images\n\nThis repository introdu... | Python | NaN | [database, deep-learning, computer-vision] | multi - label image database ; resnet - 101 mo... | label | 11 |
DeepNude-an-Image-to-Image-technology | DeepNude's algorithm and general image generat... | yuanxiaosc | Python | [{'topic': {'name': 'image-to-image'}}, {'topi... | # DeepNude-an-Image-to-Image-technology\nGAN e... | Python | NaN | [image-to-image, pix2pix, cycle-gan, dcgan, st... | ' s algorithm and general image generation the... | general\ngeneration\npix\nmodel | 22 |
pytorch-ssd | MobileNetV1, MobileNetV2, VGG based SSD/SSD-li... | qfgaohao | Python | [{'topic': {'name': 'ssd'}}, {'topic': {'name'... | # Single Shot MultiBox Detector Implementation... | Python | NaN | [ssd, pytorch, open-images, object-detection] | , mobilenetv2 , vgg based ssd / ssd - lite imp... | support\nssd\ndataset | 28 |
# In the recorded output this query returns no rows: only descriptions containing the query token itself can score.
search_engine.search('picture')
description | owner.login | primaryLanguage.name | repositoryTopics.nodes | object.text | primaryLanguage | object | topics | description_stemmed | description_keywords | description_length | |
---|---|---|---|---|---|---|---|---|---|---|---|
name |
use a different representation (word embeddings etc.)
change original representation
Latent Semantic Indexing (sklearn.decomposition.TruncatedSVD)
topic models
Idea - documents are probability distributions over vocabulary
Model documents as mixtures of several latent factors
This can be also considered as soft clustering (and turned into clustering by recovering the biggest component)
I used LDA and NMF here, chose NMF because results looked better
Let $t$ be the number of topics.
Find nonnegative $L, T$ minimizing
$\|\underset{n \times v}{D} - \underset{n \times t}{L}\ \underset{t \times v}{T}\|^2_F$
Similar to PCA (rank constraint)
Can also add regularization
# Rows (repositories) and columns remaining after the filtering above.
starred_repo_df.shape
(459, 11)
import ktrain
# Number of latent topics to extract.
num_topics = 10
# NMF topic model from ktrain over the lemmatized descriptions; n_features is
# set to the number of columns of the TF-IDF matrix computed earlier.
tm = ktrain.text.get_topic_model(
starred_repo_df['description_stemmed'],
n_topics=num_topics,
model_type='nmf',
n_features=term_document_matrix.shape[1],
# NOTE(review): lda_max_iter is passed although model_type is 'nmf' — confirm how ktrain uses it.
lda_max_iter=10,
min_df=1,
verbose=0,
hyperparam_kwargs={'nmf_alpha': 0.01, 'l1_ratio': 0.5, 'ngram_range': (1,1)}
)
using Keras version: 2.2.4-tf
# Run ktrain's build step on the same texts (presumably required for the per-topic counts — TODO confirm),
# then print each topic's top words together with its count.
tm.build(starred_repo_df['description_stemmed'])
tm.print_topics(show_counts=True)
topic:2 | count:50 | python using module retrieval http tool algorithm including leveldb implementation topic:4 | count:33 | text summarization model using evaluation document extractive abstractive deep framework topic:8 | count:32 | network neural paper list curated code shot zero repository resource topic:0 | count:29 | learning deep machine shot book interactive scalable source representation model topic:9 | count:27 | data library structure topological science manifold graph point neighborhood notebook topic:3 | count:26 | language processing natural nlp polish art course state list datasets topic:6 | count:22 | image pytorch retrieval implementation based nsfw open training classifier information topic:5 | count:15 | search com vector talk semantic expansion engine query work build topic:7 | count:13 | library machine support causal tree framework regression classification gradient inference topic:1 | count:3 | model task code training semantic similarity achieve paper sentence result
# Topic weights for every description; indexed as [document, topic] by the helper functions.
reduced_term_document_matrix = tm.predict(starred_repo_df['description_stemmed'])
# For each topic, the 5 most heavily weighted repositories (lemmatized description column only).
representative_repos = [get_repos_representing_topic(starred_repo_df, reduced_term_document_matrix, topic)[['description_stemmed']] for topic in range(num_topics)]
topic_words = tm.get_topics()
# Per-topic report: heading, word cloud, keyword set and representative repositories.
for topic in range(num_topics):
    printmd("""------\n# Topic {}\n------""".format(topic+1))
    show_word_cloud_from_texts(representative_repos[topic]['description_stemmed'])
    printmd('# Keywords')
    display(set(topic_words[topic].split()))
    printmd('## **repositories representative for {}th topic:**'.format(topic + 1))
    display(representative_repos[topic])
    print()
{'book', 'deep', 'interactive', 'learning', 'machine', 'model', 'representation', 'scalable', 'shot', 'source'}
description_stemmed | |
---|---|
name | |
h2o-3 | source fast scalable machine learning platform... |
vowpal_wabbit | wabbit is a machine learning system which push... |
mxnet-the-straight-dope | interactive book on deep learning . much easy ... |
LearningToCompare_ZSL | code for cvpr 2018 paper : learning to compare... |
d2l-en | into deep learning : an interactive deep learn... |
{'achieve', 'code', 'model', 'paper', 'result', 'semantic', 'sentence', 'similarity', 'task', 'training'}
description_stemmed | |
---|---|
name | |
iclr2016 | code for training all model in the iclr paper ... |
ir-python | python implementation for information retrieva... |
anchor-baggage | code for the article " building topic model ba... |
multifit | code to reproduce result from paper " multifit... |
sentence-similarity | implementation of various deep learning model ... |
{'algorithm', 'http', 'implementation', 'including', 'leveldb', 'module', 'python', 'retrieval', 'tool', 'using'}
description_stemmed | |
---|---|
name | |
ir-python | python implementation for information retrieva... |
data-science-ipython-notebooks | science python notebook : deep learning ( tens... |
boilerpipe3 | fork of boilerpipe with python 3 and small fix... |
gputil | python module for getting the gpu status from ... |
xlearn | performance , easy - to - use , and scalable m... |
{'art', 'course', 'datasets', 'language', 'list', 'natural', 'nlp', 'polish', 'processing', 'state'}
description_stemmed | |
---|---|
name | |
NLP-progress | to track the progress in natural language proc... |
polish-nlp-resources | - trained model and language resource for natu... |
Introduction-to-Natural-Language-Processing-UMich-Coursera | repository contains weekly assignment on imple... |
flair | very simple framework for state - of - the - a... |
nlp-datasets | list of free / public domain datasets with tex... |
{'abstractive', 'deep', 'document', 'evaluation', 'extractive', 'framework', 'model', 'summarization', 'text', 'using'}
description_stemmed | |
---|---|
name | |
tf-textanalysis-gcp | how to perform text preprocessing using bigque... |
nnsum | extractive neural network text summarization l... |
jann | . i am jann . i am text input - text output ch... |
Kashgari | is a production - ready nlp transfer learning ... |
python-sirajnet | deep complicated nlp to turn your text into my... |
{'build', 'com', 'engine', 'expansion', 'query', 'search', 'semantic', 'talk', 'vector', 'work'}
description_stemmed | |
---|---|
name | |
VectorsInSearch | . com repo to accompany the dice . com ' vecto... |
columbiau-rocchio-search-query-expander | rocchio query expansion - similar to " related... |
Kaggle_CrowdFlower | place solution for search result relevance com... |
gnes | is generic neural elastic search , a cloud - n... |
Image-to-Image-Search | reverse image search engine powered by elastic... |
{'based', 'classifier', 'image', 'implementation', 'information', 'nsfw', 'open', 'pytorch', 'retrieval', 'training'}
description_stemmed | |
---|---|
name | |
cnnimageretrieval-pytorch | image retrieval in pytorch : training and eval... |
pytorch-ssd | , mobilenetv2 , vgg based ssd / ssd - lite imp... |
nsfw_data_source_urls | of nsfw image url for the purpose of training ... |
nsfw_data_scraper | of script to aggregate image data for the purp... |
FUNIT_tensorflow | implementation of funit : few - shot unsupervi... |
{'causal', 'classification', 'framework', 'gradient', 'inference', 'library', 'machine', 'regression', 'support', 'tree'}
description_stemmed | |
---|---|
name | |
adversarial-robustness-toolbox | library for adversarial machine learning ( eva... |
catboost | fast , scalable , high performance gradient bo... |
simpletransformers | made simple with training , evaluation , and p... |
dowhy | is a python library for causal inference that ... |
nmslib | - metric space library ( nmslib ): an efficien... |
{'code', 'curated', 'list', 'network', 'neural', 'paper', 'repository', 'resource', 'shot', 'zero'}
description_stemmed | |
---|---|
name | |
ZeroShotCapsule | for paper " zero - shot user intent detection ... |
distiller | network distiller by intel ai lab : a python p... |
awesome-rnn | neural network - a curated list of resource de... |
LearningToCompare_ZSL | code for cvpr 2018 paper : learning to compare... |
Inhibited-softmax | with code for paper " inhibited softmax for un... |
{'data', 'graph', 'library', 'manifold', 'neighborhood', 'notebook', 'point', 'science', 'structure', 'topological'}
description_stemmed | |
---|---|
name | |
topopy | library for computing topological data structu... |
dagster | python library for building data application :... |
data-science-ipython-notebooks | science python notebook : deep learning ( tens... |
industry-machine-learning | curated list of applied machine learning and d... |
ttk | - topological data analysis and visualization ... |
# Prepare the topic model's recommender: 3 nearest neighbours, cosine metric.
tm.train_recommender(n_neighbors=3, metric='cosine')
def show_results(query):
    """Print the text of up to 5 recommendations for `query`, each followed by a blank line."""
    recommendations = tm.recommend(query, n=5, n_neighbors=3)
    for recommendation in recommendations:
        # The first element of each result is printed as the document text.
        print(recommendation[0], end='\n\n')
# Sample recommendation query.
show_results('search')
and memory - efficient ann with a subset - search functionality simple elasticsearch plugin wrapping around the search endpoint to provide rocchio query expansion query expansion in semantic meta - search engine . the resulting expansion system is called wiki - metasemantik . search engine with query expansion . com repo to accompany the dice . com ' vector in search ' talk by simon hughes , from the activate 2018 search conference , and the ' searching with vector ' talk from haystack 2019 ( u ). build upon my conceptual search and semantic search work from 2015
# A related query; in the recorded output it returns mostly the same documents as 'search'.
show_results('query')
and memory - efficient ann with a subset - search functionality simple elasticsearch plugin wrapping around the search endpoint to provide rocchio query expansion rocchio query expansion - similar to " related search :" found at popular search engine but based on relevant document selected by the end - user search engine with query expansion . com repo to accompany the dice . com ' vector in search ' talk by simon hughes , from the activate 2018 search conference , and the ' searching with vector ' talk from haystack 2019 ( u ). build upon my conceptual search and semantic search work from 2015
# NOTE(review): in the recorded output this query returns mostly image-related descriptions.
show_results('information retrieval')
' s algorithm and general image generation theory and practice research , including pix2pix , cyclegan , ugatit , dcgan , singan and vae model ( tensorflow2 implementation ). deepnude的算法以及通用gan图像生成的理论与实践研究 。 image detector with rest interface developed using kera and flask image retrieval in pytorch : training and evaluating cnns for image retrieval in pytorch oxford and paris : large - scale image retrieval benchmarking : octocat : various hashing method for image retrieval and serf a the baseline
Remark: ktrain also has visualization capability but I liked UMAP better
# UMAP on a precomputed distance matrix: pairwise cosine distances between the documents' topic weights.
umap_red = umap.UMAP(metric='precomputed')
umap_features = umap_red.fit_transform(metrics.pairwise.cosine_distances(reduced_term_document_matrix, reduced_term_document_matrix))
# Representative repositories of all topics stacked into a single frame.
representatives = pd.concat(representative_repos)
# NOTE(review): representative_indices is not used anywhere in the visible code.
representative_indices = np.where(starred_repo_df.index.isin(representatives.index))
umap_df = pd.DataFrame(umap_features)
# Two column names are assigned, so the embedding is expected to be 2-D (presumably UMAP's default — confirm).
umap_df.columns = ['x', 'y']
umap_df['name'] = starred_repo_df.index
# Hard topic assignment: index of the largest topic weight in each row.
umap_df['topic'] = np.argmax(reduced_term_document_matrix, axis=1)
hv.notebook_extension('bokeh','matplotlib')
opts.defaults(
opts.RGB(width=400, height=400, xaxis=None, yaxis=None, show_grid=False, bgcolor="black"))
points = hv.Points(umap_df)
# NOTE(review): labels is created but not added to any plot in the visible code.
labels = hv.Labels(umap_df, ['x','y'], 'name')
# Colour the points by assigned topic; as the last expression of the cell, the styled plot is what gets displayed.
points.opts(
opts.Points(
color='topic',
cmap='Category20',
tools=['zoom_in', 'zoom_out', 'hover'], width=800, height=600),
opts.Overlay(width=800, height=600),
)