#!/usr/bin/env python
# coding: utf-8

# Finds journals from the Beall list snapshots (Oct/Dec 2016 text files) whose
# normalised names also appear in the text extracted from the Journals-*.pdf
# files.
# Background: https://thewire.in/102950/predatory-journals-ugc-research/

# In[1]:

import io
import re
import string
from string import digits

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import textract
import pandas as pd


# In[2]:

# Captures the text between a digit and the next whitespace-separated
# "Scopus"/"WoS" token. The capture is lazy, so it stops at the first such
# token; any digits left at its start are removed later by format_name().
pattern = re.compile(r'[0-9](.*?)\s(Scopus|WoS)', re.DOTALL)


def read_text(filename):
    """Return the text of *filename* as a single whitespace-normalised string.

    The document is extracted with ``textract``; every '.' is removed and
    each run of whitespace (including line breaks) becomes one space.
    """
    raw = textract.process(filename)
    # textract hands back encoded bytes; decode before using str methods,
    # otherwise ``.replace('.', '')`` raises TypeError on Python 3.
    # NOTE(review): assumes textract's default UTF-8 output encoding.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    return ' '.join(raw.replace('.', '').split())


def format_name(name):
    """Return *name* reduced to a lowercase matching key.

    ASCII digits, spaces and every other non-alphanumeric character are
    dropped, so names that differ only in case, spacing, punctuation or
    numbering compare equal.
    """
    # Written without ``str.translate(None, ...)`` / ``filter`` so the result
    # is a real string on both Python 2 and Python 3.
    return ''.join(
        ch for ch in name.lower() if ch.isalnum() and ch not in digits
    )


def _load_names(path):
    """Return the whitespace-stripped lines of the text file at *path*."""
    # NOTE(review): assumes the list files are UTF-8 encoded -- confirm.
    with io.open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]


def _find_matches(names, formatted_names, pdf_matches, pdf_filenames):
    """Return ``(name, pdf_filename)`` pairs for names found in the PDFs.

    *names* and *formatted_names* are parallel lists (original spelling and
    matching key). *pdf_matches* and *pdf_filenames* are parallel lists (the
    keys extracted from each PDF and that PDF's file name). Pairs are
    produced in list order, PDFs in file order within each name.
    """
    # Build each lookup set once so every membership test is O(1).
    lookups = [set(keys) for keys in pdf_matches]
    found = []
    for name, key in zip(names, formatted_names):
        if not key:
            # A blank line (or a name with no letters) normalises to '' and
            # would otherwise match any empty capture from the PDFs.
            continue
        for pdf_filename, lookup in zip(pdf_filenames, lookups):
            if key in lookup:
                found.append((name, pdf_filename))
    return found


# In[3]:

filenames = ['8919877_Journals-1.pdf',
             '6988680_Journals-2.pdf',
             '9047119_Journals-3.pdf',
             '7690152_Journals-4.pdf',
             '3554232_Journals-5.pdf']

pdf_text = [read_text(filename) for filename in filenames]


# In[4]:

beall_list_oct = _load_names('Beall_list_oct2016.txt')
beall_list_dec = _load_names('Beall_list_dec2016.txt')

# Real lists (not lazy ``map`` objects) so they can be iterated repeatedly.
beall_list_oct_formatted = [format_name(x) for x in beall_list_oct]
beall_list_dec_formatted = [format_name(x) for x in beall_list_dec]


# In[5]:

# One list of normalised journal names per PDF, in ``filenames`` order.
pdf_text_matches = [
    [format_name(found[0]) for found in pattern.findall(text)]
    for text in pdf_text
]


# In[6]:

matching_indices = _find_matches(
    beall_list_dec, beall_list_dec_formatted, pdf_text_matches, filenames)


# In[7]:

df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])


# In[8]:

# A bare ``df`` only renders inside a notebook; print so the script shows it.
print(df)


# In[9]:

matching_indices = _find_matches(
    beall_list_oct, beall_list_oct_formatted, pdf_text_matches, filenames)

df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])
print(df)