# Import the PDF library.
import PyPDF2

# Open the PDF in binary mode ('rb'). The ``with`` block guarantees that the
# handle is closed even if parsing or text extraction raises; the previous
# open()/close() pair leaked the handle in that case.
with open(r'c:\Users\User\Documents\kl\sj_pdf\hozzajarulo_nyilatkozat_vedooltashoz.pdf', 'rb') as pdfFileObj:
    # Create the PDF reader object.
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Print the page count.
    print('A pdf file oldal száma :', pdfReader.numPages)
    # Address a single page (index 0).
    pageObj = pdfReader.getPage(0)
    print('---------------pdf tartalma ---------------')
    # Extract the text of the addressed page and print it.
    print(pageObj.extractText())
    print('---------------pdf információk ---------------')
    information = pdfReader.getDocumentInfo()
    number_of_pages = pdfReader.getNumPages()
    print('pdf létrehozás ideje:', information['/CreationDate'])
    # The report lines stay at column 0 so the printed text is unchanged.
    txt = f"""
Information about pdf
Author: {information.author}
Creator: {information.creator}
Producer: {information.producer}
Subject: {information.subject}
Title: {information.title}
Number of pages: {number_of_pages}
"""
    print(txt)
# The PDF file is closed here by the ``with`` block.
# Import the Tika parser module.
from tika import parser

# Parse the PDF file. The result is a dict with the keys
# 'metadata', 'content' and 'status'.
parsed_pdf = parser.from_file(r'c:\Users\User\Documents\kl\sj_pdf\hozzajarulo_nyilatkozat_vedooltashoz.pdf')
# Alternative sample input:
#parsed_pdf = parser.from_file(r'c:\Users\User\Documents\kl\sj_pdf\02_04_21_Firefox_finomsagok.pdf')

# Extract the document content as text.
data = parsed_pdf['content']

# Content banner, the extracted text, then the metadata banner - one per line.
print(
    '---------------pdf tartalma ---------------',
    data,
    '---------------pdf információk ---------------',
    sep='\n',
)

# Selected metadata entries: creation date, page count, generating program.
print('pdf létrehozás ideje: ', parsed_pdf['metadata']['Creation-Date'])
print('pdf oldalszám: ', parsed_pdf['metadata']['xmpTPg:NPages'])
print('pdf gen. prg.: ', parsed_pdf['metadata']['xmp:CreatorTool'])
# To inspect every metadata entry: print(parsed_pdf['metadata'])
# extract_doc_info.py
from PyPDF2 import PdfFileReader
def extract_information(pdf_path):
    """Print a short metadata report for a PDF and return its document info.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        Whatever ``PdfFileReader.getDocumentInfo()`` returns for the file.
        According to the PyPDF2 documentation this is ``None`` when the PDF
        carries no document-information dictionary.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        def field(name):
            # With no document info (information is None) report the field
            # as None instead of raising AttributeError.
            return getattr(information, name, None)

        # The report lines stay at column 0 so the printed text is unchanged.
        txt = f"""
Information about {pdf_path}:
Author: {field('author')}
Creator: {field('creator')}
Producer: {field('producer')}
Subject: {field('subject')}
Title: {field('title')}
Number of pages: {number_of_pages}
"""
    print(txt)
    return information
if __name__ == "__main__":
    # Demo run: print the metadata report of a sample PDF.
    path = r'c:\Users\User\Downloads\06_04_20_USB4.pdf'
    extract_information(pdf_path=path)
# Rotating PDF pages
from PyPDF4 import PdfFileReader, PdfFileWriter
def rotate_pages(pdf_path, output=r'c:\Users\User\Downloads\rotate_pages.pdf'):
    """Write a three-page PDF made of rotated copies of the first pages.

    The first page is rotated 90 degrees clockwise, the second page 90
    degrees counter-clockwise, and the third page is copied unrotated.

    Args:
        pdf_path: Path of the source PDF. Pages 0, 1 and 2 are read, so the
            document needs at least three pages.
        output: Path of the PDF to write. Defaults to the location that was
            previously hard-coded, so existing calls behave as before.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(pdf_path)
    # Rotate 90 degrees to the right (clockwise).
    page_1 = pdf_reader.getPage(0).rotateClockwise(90)
    pdf_writer.addPage(page_1)
    # Rotate 90 degrees to the left (counter-clockwise).
    page_2 = pdf_reader.getPage(1).rotateCounterClockwise(90)
    pdf_writer.addPage(page_2)
    # Keep the third page in its original orientation.
    pdf_writer.addPage(pdf_reader.getPage(2))
    with open(output, 'wb') as fh:
        pdf_writer.write(fh)
if __name__ == "__main__":
    # Demo run: rotate the leading pages of a sample PDF.
    path = r'c:\Users\User\Downloads\06_04_20_USB4.pdf'
    rotate_pages(pdf_path=path)
# Merging PDF files
from PyPDF2 import PdfFileReader, PdfFileWriter
def merge_pdfs(paths, output):
    """Concatenate the pages of several PDF files into one PDF.

    Args:
        paths: Iterable of source PDF paths; their pages are appended in
            iteration order.
        output: Path of the merged PDF to write.
    """
    merged = PdfFileWriter()
    for source in paths:
        reader = PdfFileReader(source)
        page_total = reader.getNumPages()
        for index in range(page_total):
            # Append each page of the current source to the writer.
            merged.addPage(reader.getPage(index))
    # Write the merged document once every source has been processed.
    with open(output, 'wb') as target:
        merged.write(target)
if __name__ == "__main__":
    # Demo run: merge two sample PDFs into a single file.
    paths = [
        r'c:\Users\User\Downloads\06_04_20_USB4.pdf',
        r'c:\Users\User\Downloads\01_03_20_IPTV_Streaming.pdf',
    ]
    merge_pdfs(paths, output=r'c:\Users\User\Downloads\merged.pdf')
# Splitting a PDF into separate single-page files
from PyPDF2 import PdfFileReader, PdfFileWriter
def split(path, name_of_split):
    """Write every page of a PDF into its own single-page PDF file.

    Args:
        path: Path of the source PDF.
        name_of_split: Prefix of the output files. Each file is named
            ``<name_of_split><page index>.pdf``, with the index starting
            at 0.
    """
    source = PdfFileReader(path)
    page_total = source.getNumPages()
    for index in range(page_total):
        # A fresh writer per page keeps each output file single-paged.
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(index))
        target_name = f'{name_of_split}{index}.pdf'
        with open(target_name, 'wb') as target:
            single_page.write(target)
if __name__ == '__main__':
    # Demo run: split a sample PDF into single-page files.
    path = r'c:\Users\User\Downloads\06_04_20_USB4.pdf'
    # split() appends "<page index>.pdf" to the prefix, so the prefix must not
    # carry its own ".pdf" extension; with it the files were named
    # "jupyter_page.pdf0.pdf", "jupyter_page.pdf1.pdf", ...
    split(path, r'c:\Users\User\Downloads\jupyter_page')
# Password-protecting a PDF
from PyPDF4 import PdfFileWriter, PdfFileReader
def add_encryption(input_pdf, output_pdf, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_pdf: Path of the PDF to protect.
        output_pdf: Path of the encrypted copy to write.
        password: User password passed to the writer's ``encrypt`` call; the
            owner password is passed as ``None``.
    """
    reader = PdfFileReader(input_pdf)
    writer = PdfFileWriter()
    # Copy every page unchanged before switching encryption on.
    page_total = reader.getNumPages()
    for index in range(page_total):
        writer.addPage(reader.getPage(index))
    writer.encrypt(user_pwd=password, owner_pwd=None, use_128bit=True)
    with open(output_pdf, 'wb') as target:
        writer.write(target)


## Opening a password-protected PDF (usage sketch):
# pdf_reader.decrypt(password="Miérdekel?")
# pdf_reader.getPage(0)
if __name__ == "__main__":
    # Demo run: write a password-protected copy of a sample PDF.
    add_encryption(
        input_pdf=r'c:\Users\User\Downloads\06_04_20_USB4.pdf',
        output_pdf=r'c:\Users\User\Downloads\06_04_20_USB4_encrypt.pdf',
        password='Miérdekel?',
    )
# Query the Python version (inside Jupyter).
import sys
from pathlib import Path
# NOTE: the next line is IPython/Jupyter shell-escape syntax ("!"), not plain
# Python; it runs the current interpreter executable with --version.
!{sys.executable} --version
# Python 3.7.7  (presumably the output recorded when the cell was run)
# Print a newline followed by the current user's home directory.
print('\n', Path.home())
import PyPDF2

# Open the PDF in binary mode. The ``with`` block closes the handle, which
# was previously left open.
with open(r'c:\Users\User\Downloads\06_04_20_USB4.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    print('oldalszám', pdfReader.numPages)
    pageObj = pdfReader.getPage(0)
    # The earlier second call used getPage(1-1), i.e. getPage(0): it read the
    # FIRST page again, not "the second page to the last" as its comment
    # claimed. The text is therefore extracted once.
    first_page_text = pageObj.extractText()
# Bare expression: shown as the cell result in Jupyter, no effect as a script.
first_page_text
import PyPDF2, os
from contextlib import ExitStack

# Get all the PDF filenames in the current directory, sorted
# case-insensitively. The output of an earlier run is skipped: otherwise it
# would be read as an input and truncated while still being read.
pdfFiles = sorted(
    (
        name
        for name in os.listdir('.')
        if name.endswith('.pdf') and name != 'all_sources.pdf'
    ),
    key=str.lower,
)

# The writer was never created before, so the first addPage() call raised
# NameError.
pdfWriter = PyPDF2.PdfFileWriter()

# Assumption: PyPDF2 reads page content from the source streams while
# writing, so the inputs stay open until write() has finished. ExitStack
# closes all of them afterwards, even when an error occurs.
with ExitStack() as open_inputs:
    # Loop through all the PDF files.
    for filename in pdfFiles:
        pdfFileObj = open_inputs.enter_context(open(filename, 'rb'))
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Loop through all the pages (except the first) and add them.
        for pageNum in range(1, pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)
            pdfWriter.addPage(pageObj)
    # Save the resulting PDF to a file.
    with open('all_sources.pdf', 'wb') as pdfOutput:
        pdfWriter.write(pdfOutput)
import os
from os.path import isfile, join
import PyPDF2

# If running locally, set these variables to your local directories.
pdf_dir = r"c:/Users/User/Documents/kl/sj_pdf"
txt_dir = r"c:/Users/User/Documents/kl/sj_pdf"

# Note: uses a generator expression, which is exhausted after one pass.
# Rerun the cell if you restart the loop below.
corpus = (
    f
    for f in os.listdir(pdf_dir)
    if not f.startswith('.') and isfile(join(pdf_dir, f)) and f.endswith('.pdf')
)

# The documentation for PyPDF2 is minimal.
# For this pattern, I followed the syntax at
# https://automatetheboringstuff.com/chapter13/ and
# https://github.com/msaxton/iliff_review/blob/master/code/atla_pdfConvert.py

for filename in corpus:
    print(filename)
    # Open the PDF and load it as a PyPDF2 reader object. The ``with`` block
    # closes each PDF after its pages are written; previously one handle per
    # file was left open.
    with open(join(pdf_dir, filename), mode='rb') as pdf:
        pdfReader = PyPDF2.PdfFileReader(pdf, strict=False)
        # Loop through the pages, extract the text, and write each page to
        # an individual file.
        for page in range(0, pdfReader.numPages):
            pageObj = pdfReader.getPage(page)
            text = pageObj.extractText()  # .encode('ascii', 'ignore')
            # Compile the page name: drop the 4-character ".pdf" suffix and
            # add one to the page number because Python counts from 0.
            page_name = "{}-page{}.txt".format(filename[:-4], page + 1)
            # Write each page to its own UTF-8 text file.
            with open(join(txt_dir, page_name), mode="w", encoding='utf-8') as o:
                o.write(text)

# Disabled experiment kept for reference (was an inert string literal; it
# refers to names that are not defined in this cell):
# try:
#     pdf_writer = PyPDF4.PdfFileWriter()
#     pdf_writer.addPage(pdf.getPage(page))
#     output = name_of_split + '_f'+str(page)+'.pdf'
#     with open(output, 'wb') as output_pdf:
#         pdf_writer.write(output_pdf)
# except Exception as e:
#     print('Exception:', output,e,output_pdf)
## Groups of English (ASCII) characters.
# Base groups are spelled out; the remaining groups are derived from them.
whitespace = ' \t\n\r\v\f'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
# Upper-casing the lowercase alphabet yields 'A'..'Z'.
ascii_uppercase = ascii_lowercase.upper()
ascii_letters = ''.join((ascii_lowercase, ascii_uppercase))
digits = '0123456789'
# Decimal digits followed by the hexadecimal letters in both cases.
hexdigits = ''.join((digits, 'abcdef', 'ABCDEF'))
# The octal digits are the first eight decimal digits.
octdigits = digits[:8]
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
# Everything above combined: digits, letters, punctuation, then whitespace.
printable = ''.join((digits, ascii_letters, punctuation, whitespace))