import re
import string
from string import digits
import textract
import pandas as pd
# Matches one entry of the extracted PDF text: a digit, then (lazily) the text
# up to the whitespace that precedes the indexing tag "Scopus" or "WoS".
# Group 1 is the captured text, group 2 is the tag. DOTALL lets the captured
# text span line breaks.
pattern = re.compile(
    r'[0-9](.*?)\s(Scopus|WoS)',
    flags=re.DOTALL,
)
def read_text(filename):
    """Extract the text of *filename* and return it as one normalized line.

    The document is read with ``textract.process``, every ``.`` character is
    removed, and all runs of whitespace (including newlines) are collapsed
    to single spaces.

    Args:
        filename: Path of the document to extract.

    Returns:
        The extracted text as a single whitespace-normalized string.
    """
    raw = textract.process(filename)
    # textract.process returns an encoded byte string. On Python 2 that is
    # already ``str`` and is left untouched; on Python 3 it must be decoded
    # first, otherwise ``raw.replace('.', '')`` raises TypeError.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        raw = raw.decode('utf-8', 'replace')
    without_periods = raw.replace('.', '')
    return ' '.join(without_periods.split())
def format_name(name):
    """Return a normalized comparison key for a journal name.

    The key is *name* lower-cased with ASCII digits, spaces, punctuation and
    every other non-alphanumeric character removed, so that differences in
    spacing, numbering or punctuation do not prevent two names from matching.

    Works on both Python 2 and Python 3: the previous implementation relied
    on ``str.translate(None, digits)`` and on ``filter`` returning a string,
    both of which only hold on Python 2.

    Args:
        name: The raw journal name.

    Returns:
        The normalized name as a string (empty if nothing survives).
    """
    lowered = name.lower()
    # Spaces and punctuation fail isalnum(); digits pass it, so they are
    # excluded explicitly.
    return ''.join(ch for ch in lowered if ch.isalnum() and ch not in digits)
# Input PDFs to scan, in the order their results are reported.
filenames = [
    '8919877_Journals-1.pdf',
    '6988680_Journals-2.pdf',
    '9047119_Journals-3.pdf',
    '7690152_Journals-4.pdf',
    '3554232_Journals-5.pdf',
]

# Normalized text of each PDF; pdf_text[k] belongs to filenames[k].
pdf_text = [read_text(path) for path in filenames]
# Load the October and December 2016 list files. Each line is treated as one
# entry, with surrounding whitespace (including the newline) stripped.
with open('Beall_list_oct2016.txt') as f:
    beall_list_oct = [line.strip() for line in f]
with open('Beall_list_dec2016.txt') as f:
    beall_list_dec = [line.strip() for line in f]

# Normalized keys, parallel to the raw lists above. Built as real lists rather
# than with map(): on Python 3 map() is a one-shot iterator, so re-running a
# later matching step would silently see no entries.
beall_list_oct_formatted = [format_name(name) for name in beall_list_oct]
beall_list_dec_formatted = [format_name(name) for name in beall_list_dec]
# For every PDF, collect the normalized form of each name captured by
# `pattern` (group 1 of every match). pdf_text_matches[k] belongs to
# filenames[k].
pdf_text_matches = []
for text in pdf_text:
    # Store a real list: a lazy map() object (Python 3) would be exhausted by
    # the first membership test made against it.
    matches = [format_name(groups[0]) for groups in pattern.findall(text)]
    pdf_text_matches.append(matches)
# Cross-reference the December 2016 list against the names found in each PDF.
matching_indices = []
for journal, item in zip(beall_list_dec, beall_list_dec_formatted):
    # A blank line (or a name with no letters) normalizes to '' and must not
    # be reported as a match against an equally empty extracted name.
    if not item:
        continue
    # The loop variable is deliberately not called `pdf_text`: that name is
    # the module-level list of extracted PDF texts and must not be overwritten.
    for source_file, found_names in zip(filenames, pdf_text_matches):
        if item in found_names:
            matching_indices.append((journal, source_file))

df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])
# Bare expression: renders the table when run as a notebook cell.
df
| | Journal | Source File |
---|---|---|
0 | Actual Problems of Economics | 8919877_Journals-1.pdf |
1 | Aging | 8919877_Journals-1.pdf |
2 | Australasian Medical Journal | 8919877_Journals-1.pdf |
3 | Biosciences, Biotechnology Research Asia | 8919877_Journals-1.pdf |
4 | Der Pharma Chemica | 6988680_Journals-2.pdf |
5 | European Journal of Science and Theology | 6988680_Journals-2.pdf |
6 | European Journal of Social Sciences | 6988680_Journals-2.pdf |
7 | Global Media Journal | 6988680_Journals-2.pdf |
8 | Interdisciplinary Toxicology | 6988680_Journals-2.pdf |
9 | International Journal of Health Research | 6988680_Journals-2.pdf |
10 | International Journal of Network Security | 6988680_Journals-2.pdf |
11 | International Journal of Pharmacognosy | 6988680_Journals-2.pdf |
12 | International Journal of Pharmacy and Technology | 8919877_Journals-1.pdf |
13 | Journal of Animal and Plant Sciences | 6988680_Journals-2.pdf |
14 | Journal of Applied Linguistics | 6988680_Journals-2.pdf |
15 | Journal of Applied Pharmaceutical Science | 6988680_Journals-2.pdf |
16 | Journal of Clinical and Analytical Medicine | 6988680_Journals-2.pdf |
17 | Journal of Environmental Biology | 8919877_Journals-1.pdf |
18 | Journal of Environmental Hydrology | 9047119_Journals-3.pdf |
19 | Journal of Natural Products | 9047119_Journals-3.pdf |
20 | Journal of Physical Therapy Science | 9047119_Journals-3.pdf |
21 | Journal of Software | 9047119_Journals-3.pdf |
22 | Oncoscience | 7690152_Journals-4.pdf |
23 | PharmacologyOnline | 7690152_Journals-4.pdf |
24 | Romanian Biotechnological Letters | 7690152_Journals-4.pdf |
25 | Shiraz E-Medical Journal | 7690152_Journals-4.pdf |
# Cross-reference the October 2016 list against the names found in each PDF.
matching_indices = []
for journal, item in zip(beall_list_oct, beall_list_oct_formatted):
    # A blank line (or a name with no letters) normalizes to '' and must not
    # be reported as a match against an equally empty extracted name.
    if not item:
        continue
    # The loop variable is deliberately not called `pdf_text`: that name is
    # the module-level list of extracted PDF texts and must not be overwritten.
    for source_file, found_names in zip(filenames, pdf_text_matches):
        if item in found_names:
            matching_indices.append((journal, source_file))

df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])
# Bare expression: renders the table when run as a notebook cell.
df
| | Journal | Source File |
---|---|---|
0 | Science and Education | 7690152_Journals-4.pdf |