#!/usr/bin/env python
# coding: utf-8

# https://thewire.in/102950/predatory-journals-ugc-research/

# In[1]:


import re
import string
from string import digits
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import textract
import pandas as pd


# In[2]:


# Matches a single digit, then lazily captures everything (newlines included,
# because of DOTALL) up to a whitespace character that is immediately followed
# by the tag "Scopus" or "WoS". Group 1 is the captured text, group 2 the tag.
# Presumably group 1 is the journal title in a row of the PDF table -- verify
# against the actual PDF layout.
pattern = re.compile(
    r'[0-9](.*?)\s(Scopus|WoS)',
    flags=re.DOTALL,
)

def read_text(filename):
    """Extract the text of a document and flatten it onto a single line.

    The file is passed to ``textract.process``; every '.' character is then
    removed and each run of whitespace (newlines included) is collapsed to a
    single space.

    Args:
        filename: Path of the document to extract text from.

    Returns:
        The flattened text as a native ``str``.
    """
    raw = textract.process(filename)
    if not isinstance(raw, str):
        # On Python 3 textract hands back ``bytes``; decode before using str
        # operations. On Python 2 the result is already ``str`` and is left
        # untouched, so the original behaviour is preserved there.
        raw = raw.decode('utf-8')
    without_dots = raw.replace('.', '')
    return ' '.join(without_dots.split())

def format_name(name):
    """Normalise a name so that two spellings of it can be compared.

    The result is lower-cased and keeps only alphanumeric characters that are
    not ASCII digits; spaces, punctuation and digits are dropped.

    The previous implementation relied on the Python 2 only form
    ``str.translate(None, digits)`` and on ``filter`` returning a string; this
    version yields the same result there and also works on Python 3.

    Args:
        name: The raw name.

    Returns:
        The normalised name as a string (empty if nothing survives).
    """
    return ''.join(
        ch for ch in name.lower()
        if ch.isalnum() and ch not in digits
    )


# In[3]:


# PDF files to scan, in processing order.
filenames = [
    '8919877_Journals-1.pdf',
    '6988680_Journals-2.pdf',
    '9047119_Journals-3.pdf',
    '7690152_Journals-4.pdf',
    '3554232_Journals-5.pdf',
]
# pdf_text[k] is the flattened text that read_text() produced for filenames[k].
pdf_text = [read_text(filename) for filename in filenames]


# In[4]:


# Load the two list snapshots: one entry per line, with surrounding
# whitespace (including the trailing newline) stripped.
with open('Beall_list_oct2016.txt') as f:
    beall_list_oct = [line.strip() for line in f]

with open('Beall_list_dec2016.txt') as f:
    beall_list_dec = [line.strip() for line in f]

# Normalised names, index-aligned with the raw lists above. These are built
# as real lists rather than with map(): on Python 3 map() returns a one-shot
# iterator, which would be empty the second time it is iterated.
beall_list_oct_formatted = [format_name(entry) for entry in beall_list_oct]
beall_list_dec_formatted = [format_name(entry) for entry in beall_list_dec]


# In[5]:


# pdf_text_matches[k] holds the normalised form of every name that `pattern`
# captured (group 1) in pdf_text[k].
pdf_text_matches = []
for text in pdf_text:
    # Materialise a list: a lazy map() object (Python 3) would be exhausted
    # by the first membership test made against it later on.
    matches = [format_name(found[0]) for found in pattern.findall(text)]
    pdf_text_matches.append(matches)


# In[6]:


# Find every entry of the December list whose normalised name also occurs
# among the names extracted from a PDF, recording (raw entry, source file).
# One set per PDF is built up front so each lookup is a hash test rather
# than a scan of the whole list.
pdf_name_sets = [set(names) for names in pdf_text_matches]
matching_indices = []
for i, item in enumerate(beall_list_dec_formatted):
    # The inner loop variable must not be called `pdf_text`: that would
    # overwrite the module-level list of extracted PDF texts.
    for j, pdf_names in enumerate(pdf_name_sets):
        if item in pdf_names:
            matching_indices.append((beall_list_dec[i], filenames[j]))


# In[7]:


# One row per match: the raw list entry and the PDF it was found in.
df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])


# In[8]:


# Bare expression: shows the table when run as a notebook cell; it has no
# effect when the file is run as a plain script.
df


# In[9]:


# Same comparison as for the December list, now for the October list:
# record (raw entry, source file) for every entry whose normalised name also
# occurs among the names extracted from a PDF.
# One set per PDF is built up front so each lookup is a hash test rather
# than a scan of the whole list.
pdf_name_sets = [set(names) for names in pdf_text_matches]
matching_indices = []
for i, item in enumerate(beall_list_oct_formatted):
    # The inner loop variable must not be called `pdf_text`: that would
    # overwrite the module-level list of extracted PDF texts.
    for j, pdf_names in enumerate(pdf_name_sets):
        if item in pdf_names:
            matching_indices.append((beall_list_oct[i], filenames[j]))
# One row per match: the raw list entry and the PDF it was found in.
df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])
# Bare expression: shows the table when run as a notebook cell; it has no
# effect when the file is run as a plain script.
df