Table of Contents

In [1]:

import re
import string
from string import digits
import textract
import pandas as pd

In [2]:

# Captures (text, index name) pairs: the text lazily matched between a digit
# and the whitespace that precedes "Scopus" or "WoS". DOTALL lets the captured
# text span line breaks.
pattern = re.compile(
    r'[0-9](.*?)\s(Scopus|WoS)',
    flags=re.DOTALL,
)

def read_text(filename):
    """Extract a document's text and normalise it for pattern matching.

    Parameters
    ----------
    filename : str
        Path of the file handed to ``textract.process``.

    Returns
    -------
    str
        The extracted text with every '.' removed and each run of
        whitespace collapsed to a single space.
    """
    raw = textract.process(filename)
    # textract is documented to return a byte string. On Python 3 that is
    # ``bytes``, and ``bytes.replace('.', '')`` raises TypeError, so decode it
    # first. On Python 2 ``bytes`` is ``str`` and the value is left untouched.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        raw = raw.decode('utf-8')
    return ' '.join(raw.replace('.', '').split())

def format_name(name):
    """Reduce a journal name to a lowercase comparison key.

    ASCII digits, spaces and every other non-alphanumeric character are
    dropped and the remaining characters are lowercased, e.g.
    ``'Journal of Software 2'`` becomes ``'journalofsoftware'``.

    Parameters
    ----------
    name : str
        Raw journal name.

    Returns
    -------
    str
        The normalised key.
    """
    # ``name.translate(None, digits)`` raises TypeError on Python 3 and
    # ``filter`` over a string returns an iterator there instead of a string.
    # This join produces the same key on both Python 2 and Python 3.
    return ''.join(
        ch for ch in name.lower()
        if ch.isalnum() and ch not in digits
    )

In [3]:

# PDF files to scan.
filenames = [
    '8919877_Journals-1.pdf',
    '6988680_Journals-2.pdf',
    '9047119_Journals-3.pdf',
    '7690152_Journals-4.pdf',
    '3554232_Journals-5.pdf',
]
# Normalised text of each PDF, index-aligned with `filenames`.
pdf_text = [read_text(filename) for filename in filenames]

In [4]:

# Load the October-2016 and December-2016 list files, one entry per line,
# stripping surrounding whitespace from each entry.
with open('Beall_list_oct2016.txt') as f:
    beall_list_oct = [line.strip() for line in f]

with open('Beall_list_dec2016.txt') as f:
    beall_list_dec = [line.strip() for line in f]

# Normalised keys, index-aligned with the raw lists above. They are built as
# real lists rather than `map` objects, which are one-shot iterators on
# Python 3, so cells that iterate over them can be re-run safely.
beall_list_oct_formatted = [format_name(entry) for entry in beall_list_oct]
beall_list_dec_formatted = [format_name(entry) for entry in beall_list_dec]

In [5]:

# For every PDF text, collect the normalised form of each name captured by
# `pattern` (group 0 of each match; the index name in group 1 is ignored).
# Each entry is materialised as a list: on Python 3 a bare `map` object is a
# one-shot iterator, and a later `in` test would consume it, so every lookup
# after the first would silently miss.
pdf_text_matches = []
for text in pdf_text:
    matches = [format_name(found[0]) for found in pattern.findall(text)]
    pdf_text_matches.append(matches)

In [6]:

# Pair every December-2016 list entry with each PDF whose extracted names
# contain the entry's normalised key.
# The inner loop variable must not be called `pdf_text`: that name holds the
# list of extracted PDF texts, and rebinding it here makes a later re-run of
# the extraction cell iterate over the wrong data.
matching_indices = []
for i, item in enumerate(beall_list_dec_formatted):
    for j, file_matches in enumerate(pdf_text_matches):
        if item in file_matches:
            matching_indices.append((beall_list_dec[i], filenames[j]))

In [7]:

# Tabulate the (journal, source file) pairs held in `matching_indices`.
df = pd.DataFrame(data=matching_indices, columns=['Journal', 'Source File'])

In [8]:

# Bare expression: renders the matches table as this cell's output.
df

Out[8]:

	Journal	Source File
0	Actual Problems of Economics	8919877_Journals-1.pdf
1	Aging	8919877_Journals-1.pdf
2	Australasian Medical Journal	8919877_Journals-1.pdf
3	Biosciences, Biotechnology Research Asia	8919877_Journals-1.pdf
4	Der Pharma Chemica	6988680_Journals-2.pdf
5	European Journal of Science and Theology	6988680_Journals-2.pdf
6	European Journal of Social Sciences	6988680_Journals-2.pdf
7	Global Media Journal	6988680_Journals-2.pdf
8	Interdisciplinary Toxicology	6988680_Journals-2.pdf
9	International Journal of Health Research	6988680_Journals-2.pdf
10	International Journal of Network Security	6988680_Journals-2.pdf
11	International Journal of Pharmacognosy	6988680_Journals-2.pdf
12	International Journal of Pharmacy and Technology	8919877_Journals-1.pdf
13	Journal of Animal and Plant Sciences	6988680_Journals-2.pdf
14	Journal of Applied Linguistics	6988680_Journals-2.pdf
15	Journal of Applied Pharmaceutical Science	6988680_Journals-2.pdf
16	Journal of Clinical and Analytical Medicine	6988680_Journals-2.pdf
17	Journal of Environmental Biology	8919877_Journals-1.pdf
18	Journal of Environmental Hydrology	9047119_Journals-3.pdf
19	Journal of Natural Products	9047119_Journals-3.pdf
20	Journal of Physical Therapy Science	9047119_Journals-3.pdf
21	Journal of Software	9047119_Journals-3.pdf
22	Oncoscience	7690152_Journals-4.pdf
23	PharmacologyOnline	7690152_Journals-4.pdf
24	Romanian Biotechnological Letters	7690152_Journals-4.pdf
25	Shiraz E-Medical Journal	7690152_Journals-4.pdf

In [9]:

# Same matching against the October-2016 list: pair each entry with every PDF
# whose extracted names contain the entry's normalised key.
# This form does not rebind `pdf_text`, which holds the list of extracted PDF
# texts; the earlier inner loop variable of that name overwrote it.
matching_indices = [
    (journal, source_file)
    for journal, key in zip(beall_list_oct, beall_list_oct_formatted)
    for source_file, file_matches in zip(filenames, pdf_text_matches)
    if key in file_matches
]
df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])
df

Out[9]:

	Journal	Source File
0	Science and Education	7690152_Journals-4.pdf