#!/usr/bin/env python
# coding: utf-8
# Open In Colab

# # #

# # #

# # PDF handling with Python
#
# ## PyPDF2 / PyPDF4 packages
# ## tika package
# ## pdfplumber package
#
# https://klajosw.blogspot.com/
# https://github.com/klajosw/
#
# ---
#
# # Installation
#
# ## Anaconda
#
#     conda install PyPDF2
#     conda install tika
#     conda install pdfplumber
#
# ## Python
#
#     python3 -m pip install PyPDF2
#     python3 -m pip show PyPDF2
#
#     python3 -m pip install tika
#     python3 -m pip install pdfplumber
#
# ### Installing from a local archive
#
#     python3 -m pip install c:\Users\User\Downloads\PyPDF4-1.27.0.tar.gz
#
# ----

# In[ ]:

# Import the module.
import PyPDF2

# Sample document shared by the PyPDF2 and the tika examples below.
SAMPLE_PDF = r'c:\Users\User\Documents\kl\sj_pdf\hozzajarulo_nyilatkozat_vedooltashoz.pdf'

# Open the file in binary mode; the context manager closes it even if one of
# the reads or prints below raises.
with open(SAMPLE_PDF, 'rb') as pdfFileObj:
    # Create the PDF reader object.
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # Print the page count.
    print('A pdf file oldal száma :', pdfReader.numPages)

    # Address a single page (the first one).
    pageObj = pdfReader.getPage(0)

    print('---------------pdf tartalma ---------------')
    # Extract the text of the addressed page and print it.
    print(pageObj.extractText())

    print('---------------pdf információk ---------------')
    information = pdfReader.getDocumentInfo()
    number_of_pages = pdfReader.getNumPages()
    # .get() instead of [...]: a missing creation date prints None rather
    # than aborting the cell with KeyError.
    print('pdf létrehozás ideje:', information.get('/CreationDate'))
    txt = f"""
Information about pdf
Author: {information.author}
Creator: {information.creator}
Producer: {information.producer}
Subject: {information.subject}
Title: {information.title}
Number of pages: {number_of_pages}
"""
    print(txt)

# In[ ]:

# Import the module.
from tika import parser

# Parse the PDF file.
parsed_pdf = parser.from_file(SAMPLE_PDF)
# Alternative sample document:
# parsed_pdf = parser.from_file(r'c:\Users\User\Documents\kl\sj_pdf\02_04_21_Firefox_finomsagok.pdf')
# print(parsed_pdf.keys())  ## dict_keys(['metadata', 'content', 'status'])

# Extract the PDF content as text.
data = parsed_pdf['content']
print('---------------pdf tartalma ---------------')
# Print the extracted content.
print(data)

print('---------------pdf információk ---------------')
# .get() instead of [...]: a metadata key that is absent prints None rather
# than aborting the cell with KeyError.
metadata = parsed_pdf['metadata']
print('pdf létrehozás ideje: ', metadata.get('Creation-Date'))
print('pdf oldalszám: ', metadata.get('xmpTPg:NPages'))
print('pdf gen. prg.: ', metadata.get('xmp:CreatorTool'))
# print(parsed_pdf['metadata'])

# In[ ]:

# In[ ]:

# extract_doc_info.py
from PyPDF2 import PdfFileReader


def extract_information(pdf_path):
    """Print a summary of a PDF's document information and return it.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The object returned by ``PdfFileReader.getDocumentInfo()``.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

    txt = f"""
    Information about {pdf_path}:

    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
    print(txt)
    return information


if __name__ == '__main__':
    path = r'c:\Users\User\Downloads\06_04_20_USB4.pdf'
    extract_information(path)

# In[ ]:

# Rotating PDF content
from PyPDF4 import PdfFileReader, PdfFileWriter


def rotate_pages(pdf_path, output=r'c:\Users\User\Downloads\rotate_pages.pdf'):
    """Write a three-page PDF built from the first three pages of *pdf_path*.

    Page 1 is rotated 90 degrees clockwise, page 2 is rotated 90 degrees
    counter-clockwise and page 3 is copied unchanged. The source must have
    at least three pages because pages 0, 1 and 2 are fetched
    unconditionally.

    Args:
        pdf_path: Path of the source PDF.
        output: Path of the PDF to write. Defaults to the location that was
            previously hard-coded in the function body.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(pdf_path)

    # Rotate 90 degrees to the right.
    page_1 = pdf_reader.getPage(0).rotateClockwise(90)
    pdf_writer.addPage(page_1)

    # Rotate 90 degrees to the left.
    page_2 = pdf_reader.getPage(1).rotateCounterClockwise(90)
    pdf_writer.addPage(page_2)

    # Keep the original orientation.
    pdf_writer.addPage(pdf_reader.getPage(2))

    with open(output, 'wb') as fh:
        pdf_writer.write(fh)


if __name__ == '__main__':
    path = r'c:\Users\User\Downloads\06_04_20_USB4.pdf'
    rotate_pages(path)

# In[ ]:

# Merging PDFs
from PyPDF2 import PdfFileReader, PdfFileWriter


def merge_pdfs(paths, output):
    """Concatenate all pages of several PDFs into one file.

    Args:
        paths: Iterable of source PDF paths, merged in iteration order.
        output: Path of the merged PDF to write.
    """
    pdf_writer = PdfFileWriter()

    for path in paths:
        pdf_reader = PdfFileReader(path)
        for page in range(pdf_reader.getNumPages()):
            # Add each page to the writer object.
            pdf_writer.addPage(pdf_reader.getPage(page))

    # Write out the merged PDF.
    with open(output, 'wb') as out:
        pdf_writer.write(out)


if __name__ == '__main__':
    paths = [r'c:\Users\User\Downloads\06_04_20_USB4.pdf',
             r'c:\Users\User\Downloads\01_03_20_IPTV_Streaming.pdf']
    merge_pdfs(paths, output=r'c:\Users\User\Downloads\merged.pdf')

# In[ ]:

# Cutting a PDF apart / splitting it into single pages
from PyPDF2 import PdfFileReader, PdfFileWriter


def split(path, name_of_split):
    """Write every page of a PDF to its own single-page file.

    Args:
        path: Path of the source PDF.
        name_of_split: Output prefix; page ``i`` (0-based) is written to
            ``f'{name_of_split}{i}.pdf'``, so the prefix should not carry a
            ``.pdf`` extension itself.
    """
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))

        output = f'{name_of_split}{page}.pdf'
        with open(output, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)


if __name__ == '__main__':
    path = r'c:\Users\User\Downloads\06_04_20_USB4.pdf'
    # The prefix is passed without ".pdf": split() appends "<page>.pdf"
    # itself, so the old argument produced names like "jupyter_page.pdf0.pdf".
    split(path, r'c:\Users\User\Downloads\jupyter_page')

# In[ ]:

# Password-protecting a PDF
from PyPDF4 import PdfFileWriter, PdfFileReader


def add_encryption(input_pdf, output_pdf, password):
    """Copy a PDF and protect the copy with a user password.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the encrypted PDF to write.
        password: User password passed to ``PdfFileWriter.encrypt``.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(input_pdf)

    for page in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page))

    pdf_writer.encrypt(user_pwd=password, owner_pwd=None, use_128bit=True)

    with open(output_pdf, 'wb') as fh:
        pdf_writer.write(fh)

    ## Opening with a password:
    # pdf_reader.decrypt(password="Miérdekel?")
    # pdf_reader.getPage(0)


if __name__ == '__main__':
    add_encryption(input_pdf=r'c:\Users\User\Downloads\06_04_20_USB4.pdf',
                   output_pdf=r'c:\Users\User\Downloads\06_04_20_USB4_encrypt.pdf',
                   password='Miérdekel?')

# In[ ]:

# Query the Python version (in Jupyter)
import sys
from pathlib import Path

# get_ipython() only exists when running under IPython/Jupyter; as a plain
# script the bare call raised NameError, so fall back to a subprocess.
try:
    ipython_shell = get_ipython()
except NameError:
    ipython_shell = None

if ipython_shell is not None:
    ipython_shell.system('{sys.executable} --version')  # Python 3.7.7
else:
    import subprocess
    subprocess.run([sys.executable, '--version'], check=False)

print('\n', Path.home())

# In[ ]:

import PyPDF2

# The context manager closes the file, which the original cell never did.
with open(r'c:\Users\User\Downloads\06_04_20_USB4.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    print('oldalszám', pdfReader.numPages)
    pageObj = pdfReader.getPage(0)
    # NOTE: the two results below are discarded when this runs as a script.
    pageObj.extractText()
    pdfReader.getPage(1 - 1).extractText()

## Reading from the second page through the last one

# In[ ]:

import os

import PyPDF2

# Get all the PDF filenames.
# Collect every PDF filename in the current directory, sorted
# case-insensitively. The merged output is excluded: on a re-run it would
# otherwise be merged into itself while being truncated for writing.
MERGED_OUTPUT = 'all_sources.pdf'
pdfFiles = sorted(
    (name for name in os.listdir('.')
     if name.endswith('.pdf') and name != MERGED_OUTPUT),
    key=str.lower,
)

# The writer was used below without ever being created (NameError).
pdfWriter = PyPDF2.PdfFileWriter()

# Keep every input open until the merged file has been written, then close
# them all, also when something fails halfway through.
openedInputs = []
try:
    # Loop through all the PDF files.
    for filename in pdfFiles:
        pdfFileObj = open(filename, 'rb')
        openedInputs.append(pdfFileObj)
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Loop through all the pages (except the first) and add them.
        for pageNum in range(1, pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)
            pdfWriter.addPage(pageObj)

    # Save the resulting PDF to a file.
    with open(MERGED_OUTPUT, 'wb') as pdfOutput:
        pdfWriter.write(pdfOutput)
finally:
    for inputHandle in openedInputs:
        inputHandle.close()

# In[ ]:

import os
from os.path import isfile, join
import PyPDF2

# If running locally, set these variables to your local directories.
pdf_dir = r"c:/Users/User/Documents/kl/sj_pdf"
txt_dir = r"c:/Users/User/Documents/kl/sj_pdf"

# Note: uses a generator expression, which is exhausted after one pass.
# Rerun the cell if you restart the loop below.
corpus = (f for f in os.listdir(pdf_dir)
          if not f.startswith('.')
          and isfile(join(pdf_dir, f))
          and f.endswith('.pdf'))

# The documentation for PyPDF2 is minimal. For this pattern, I followed the
# syntax at https://automatetheboringstuff.com/chapter13/ and
# https://github.com/msaxton/iliff_review/blob/master/code/atla_pdfConvert.py
for filename in corpus:
    print(filename)
    # Open the PDF and load it as a PyPDF2 reader object; the context manager
    # closes each source file, which the original loop never did.
    with open(join(pdf_dir, filename), mode='rb') as pdf:
        pdfReader = PyPDF2.PdfFileReader(pdf, strict=False)
        # Loop through the pages, extract the text, and write each page to
        # an individual file.
        for page in range(0, pdfReader.numPages):
            pageObj = pdfReader.getPage(page)
            text = pageObj.extractText()  # .encode('ascii', 'ignore')
            # Compile the page name. Add one because Python counts from 0.
            page_name = "{}-page{}.txt".format(filename[:-4], page + 1)
            # Write each page to its own file.
            with open(join(txt_dir, page_name), mode="w", encoding='utf-8') as o:
                o.write(text)
            # A disabled experiment that also wrote each page to its own PDF
            # through PyPDF4.PdfFileWriter used to sit here inside a bare
            # string literal; it referenced names this cell never defines.

# In[ ]:

## Grouping of English (ASCII) characters.
## These have the same values as the standard-library `string` module constants.
whitespace = ' \t\n\r\v\f'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
printable = digits + ascii_letters + punctuation + whitespace

# In[ ]: