#!/usr/bin/env python
# coding: utf-8

# # UGC's 'preferred' journals are still predatory?
#
# In January 2017, the University Grants Commission of India released [a list of 38,635 journals](http://www.ugc.ac.in/ugc_notices.aspx?id=1604) that it considers 'genuine'; only articles published in these
# journals will count towards academic performance evaluation.
#
# The previous list was scattered across 5 scanned PDF documents and was almost impossible to search.
# [The Wire](https://thewire.in/102950/predatory-journals-ugc-research/) published an analysis of these journals and found at least 35 of them to be [predatory](https://en.wikipedia.org/wiki/Predatory_open_access_publishing).
#
# I did an [independent analysis](http://nbviewer.jupyter.org/gist/saketkc/19b3c85d2d6ffe17fda8350256c3c64a) and found only 25. The differing numbers arise from the simple fact that those PDFs are hard to parse.
#
# The good news is that UGC re-released the list in [text format](http://ugc.ac.in/journallist/) on April 14th, 2017. The bad news is that as many as 82 journals now overlap with Jeffrey Beall's list.
# There could well be more, since here I only check for exact matches (case-sensitive, identical spacing, etc.).
#
# All the data and scripts are hosted [here](https://github.com/saketkc/ugc-predatory-journal-analysis).
#
# This notebook is GPLv3 licensed. Please see [LICENSE](LICENSE) for details.
#
# Update 10th May 2017: Added a `clean_name` function to handle inexact matches. The current tally of journals is 94 (merging the Jan 2017 and Dec 2016 versions of Beall's list). There are 50 predatory publishers.

# In[1]:

import re

import pandas as pd
from fuzzywuzzy import fuzz, process

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


def clean_name(x):
    # Remove parenthesised abbreviations, e.g.:
    # Journal of Food, Agriculture and Environment (JFAE) => Journal of Food, Agriculture and Environment
    cleaned = re.sub(r'\(([A-Za-z0-9_]+)\)', '', x).strip()
    # Collapse runs of whitespace into single spaces
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Lowercase and normalise ampersands so 'A & B' matches 'A and B'
    # (strings are already Unicode in Python 3, so no explicit decode is needed)
    return cleaned.lower().replace('&', 'and')


# In[2]:

with open('Beall_list_dec2016.txt') as f:
    beall_list_dec = [clean_name(x.strip()) for x in f.readlines()]

df = pd.read_csv('UGC_Journal_list_2017.csv')
df.Title = df.Title.apply(clean_name)
df.Publisher = df.Publisher.astype(str).apply(clean_name)


# In[3]:

exact_matches_dec2016 = pd.DataFrame({'Journals': list(set(df.Title).intersection(beall_list_dec))})
#fuzzy_matches = [process.extractOne(x, beall_list_dec) for x in set(df.Title)]


# In[4]:

exact_matches_dec2016.sort_values(by='Journals').reset_index(drop=True)
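# ### Fuzzy matching (aside)
#
# The commented-out `process.extractOne` call above hints at a fuzzy pass on top of the exact
# intersection. The cell below is a minimal sketch of how such a pass could look; the
# `token_sort_ratio` scorer, the score cutoff of 95, and the new variable names are my
# assumptions for illustration, not part of the original tally, and anything this flags
# still needs manual review.

# In[ ]:

# Fuzzy pass (sketch): for each UGC title without an exact match, find the closest
# entry in Beall's Dec 2016 list and keep it only if the similarity clears the cutoff.
# NOTE: the scorer and the cutoff of 95 are assumptions; this compares every unmatched
# title against the whole list, so it can take a while to run.
unmatched_titles = set(df.Title) - set(beall_list_dec)
fuzzy_candidates = []
for title in unmatched_titles:
    best = process.extractOne(title, beall_list_dec,
                              scorer=fuzz.token_sort_ratio, score_cutoff=95)
    if best is not None:
        fuzzy_candidates.append({'UGC_Title': title,
                                 'Beall_Title': best[0],
                                 'Score': best[1]})

fuzzy_matches_dec2016 = pd.DataFrame(fuzzy_candidates,
                                     columns=['UGC_Title', 'Beall_Title', 'Score'])
fuzzy_matches_dec2016.sort_values(by='Score', ascending=False).reset_index(drop=True)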
# ## Update
#
# Update for Beall's list, [Jan 2017 version](https://web.archive.org/web/20170111172309/https://scholarlyoa.com/individual-journals/). For some reason, the total count drops to 43.

# In[5]:

with open('Beall_list_Jan2017.txt') as f:
    beall_list_jan = [clean_name(x.strip()) for x in f.readlines()]

exact_matches_jan2017 = pd.DataFrame({'Journals': list(set(df.Title).intersection(beall_list_jan))})
exact_matches_jan2017.sort_values(by='Journals').reset_index(drop=True)


# ## Union of the Dec 2016 and Jan 2017 versions overlapping UGC's list

# In[6]:

overlap_jan_dec = pd.concat([exact_matches_dec2016, exact_matches_jan2017]).drop_duplicates().sort_values(by='Journals').reset_index(drop=True)
overlap_jan_dec


# ## Total journals in each version of Beall's list (line counts):
#
# 1250 Beall_list_dec2016.txt
#
# 1310 Beall_list_Jan2017.txt
#
# 1112 Beall_list_oct2016.txt

# ## Publisher list overlap
#
# ### Dec 2016 version

# In[7]:

with open('Beall_publisher_list_Dec2016.txt') as f:
    beall_publisher_list_dec = [clean_name(x.strip()) for x in f.readlines()]

publisher_matches_dec2016 = pd.DataFrame({'Publisher': list(set(df.Publisher).intersection(beall_publisher_list_dec))})
publisher_matches_dec2016.sort_values(by='Publisher').reset_index(drop=True)


# ### Jan 2017 version

# In[8]:

with open('Beall_publisher_list_Jan2017.txt') as f:
    beall_publisher_list_jan = [clean_name(x.strip()) for x in f.readlines()]

publisher_matches_jan2017 = pd.DataFrame({'Publisher': list(set(df.Publisher).intersection(beall_publisher_list_jan))})
publisher_matches_jan2017.sort_values(by='Publisher').reset_index(drop=True)


# ## Union of the publisher lists from Dec 2016 and Jan 2017

# In[9]:

overlap_pub_jan_dec = pd.concat([publisher_matches_dec2016, publisher_matches_jan2017]).drop_duplicates().sort_values(by='Publisher').reset_index(drop=True)
overlap_pub_jan_dec
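# ### Tally
#
# The headline numbers in the update note at the top (94 journals, 50 publishers) come from the
# two union tables above. The sketch below simply prints their sizes; the exact counts depend on
# which versions of the input files are used.

# In[ ]:

# Summary (sketch): sizes of the merged journal and publisher overlap tables.
print("Journals overlapping Beall's list (Dec 2016 + Jan 2017 union): {}".format(len(overlap_jan_dec)))
print("Publishers overlapping Beall's list (Dec 2016 + Jan 2017 union): {}".format(len(overlap_pub_jan_dec)))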