from whoosh.fields import Schema
from whoosh.fields import ID, KEYWORD, TEXT

# Index schema: `id` uniquely identifies a document, `path` points at the
# extracted text file, `source` at the original PDF.  Only `text` is left
# unstored because it can be large.
pdf_schema = Schema(id=ID(unique=True, stored=True),
                    path=ID(stored=True),
                    source=ID(stored=True),
                    author=TEXT(stored=True),
                    title=TEXT(stored=True),
                    text=TEXT)

import os

# NOTE(review): the original used the IPython magic `cd pydf/`, which is not
# valid Python in a plain script; os.chdir is the equivalent.
os.chdir('pydf')

from whoosh.index import create_in

if not os.path.exists("pdf-index"):
    os.mkdir("pdf-index")
    index = create_in("pdf-index", pdf_schema)

from whoosh.index import open_dir

index = open_dir("pdf-index")
writer = index.writer()

# (id, author, title) for the four papers.  path/source are derived from the
# id, which replaces the four near-identical add_document calls of the
# original; runtime strings are preserved byte-for-byte.
_documents = [
    ('blei2003',
     'David Blei, Andrew Ng, Michael Jordan',
     'Latent Dirichlet Allocation'),
    ('goodwyn2013',
     'Erik Goodwyn',
     'Recurrent motifs as resonant attractor states in the narrative field:'
     ' a testable model of archetype'),
    ('meij2009',
     'Edgar Meij, Dolf Trieschnigg, Maarten de Rijke, Wessel Kraaij',
     'Conceptual language models for domain-specific retrieval'),
    ('muellner2011',
     'David Muellner',
     'Modern hierarchical, agglomerative clustering algorithms'),
]

for doc_id, author, title in _documents:
    txt_path = 'data/%s.txt' % doc_id
    # `with` closes the handle; the original leaked one per document.
    with open(txt_path, encoding='utf-8') as infile:
        writer.add_document(id=doc_id,
                            path=txt_path,
                            source='static/pdfs/%s.pdf' % doc_id,
                            author=author,
                            title=title,
                            text=infile.read())
writer.commit()

searcher = index.searcher()

from whoosh.query import Term, And

query = And([Term("text", "model"), Term("text", "topic")])
results = searcher.search(query)
print('Number of hits:', len(results))
print('Best hit:', results[0])

from whoosh.query import Or

# Exercise placeholder completed: an Or query matches documents that contain
# either of the terms, not necessarily both.
query = Or([Term("text", "model"), Term("text", "topic")])
results = searcher.search(query)
print('Number of hits:', len(results))
print('Best hit:', results[0])

from whoosh.qparser import QueryParser
# (the `parser = QueryParser(...)` assignment continues in the next cell)
parser = QueryParser("text", index.schema)
parser.parse("probability model prior")
parser.parse("(cluster OR grouping) AND (model OR schema)")
parser.parse("topic index author:'Dolf Trieschnigg'")
parser.parse("clust*")

# Exercise placeholder completed: show how one of the query strings is
# parsed into a Whoosh query tree.
print(parser.parse("topic index author:'Dolf Trieschnigg'"))

import subprocess

subprocess.call(['ls', '-l'])
subprocess.call(['pdftotext', 'pdfs/blei2003.pdf', 'data/blei2003.txt'])
subprocess.call(['pdftotext', '-enc', 'UTF-8',
                 'pdfs/blei2003.pdf', 'data/blei2003.txt'])

import os


def pdftotext(pdf):
    """Convert *pdf* to a UTF-8 text file in the data/ directory.

    Exercise placeholder completed: derive the output file name from the
    PDF's base name and shell out to pdftotext with explicit UTF-8 output,
    mirroring the subprocess call demonstrated above.
    """
    basename, _ = os.path.splitext(os.path.basename(pdf))
    subprocess.call(['pdftotext', '-enc', 'UTF-8', pdf,
                     os.path.join('data', basename + '.txt')])


# if your answer is correct this should print the first 1000 bytes of the text file
pdftotext("pdfs/blei2003.pdf")
with open(os.path.join('data', 'blei2003.txt')) as infile:
    print(infile.read(1000))

subprocess.call(['pdftotext', '-htmlmeta', '-enc', 'UTF-8',
                 'pdfs/muellner2011.pdf', 'data/muellner2011.html'])
# `with` closes the preview handle (the original leaked it).
with open('data/muellner2011.html') as infile:
    print(infile.read(500))

from bs4 import BeautifulSoup


def parse_html(filename):
    """Extract the Author, Title and Text from a HTML file which was
    produced by pdftotext with the option -htmlmeta."""
    # Open in binary mode: BeautifulSoup honours `from_encoding` only for
    # byte input; for a str it is ignored (with a warning).
    with open(filename, 'rb') as infile:
        html = BeautifulSoup(infile, "html.parser", from_encoding='utf-8')
    d = {'text': html.pre.text}
    if html.title is not None:
        d['title'] = html.title.text
    for meta in html.findAll('meta'):
        try:
            if meta['name'] in ('Author', 'Title'):
                d[meta['name'].lower()] = meta['content']
        except KeyError:
            continue
    return d


parse_html('data/muellner2011.html')


def pdftotext(pdf):
    """Convert a pdf to a text file. Extract the Author and Title and
    return a dictionary consisting of the author, title and text.

    Exercise placeholder completed: convert via -htmlmeta so the metadata
    can be recovered with parse_html, then write the plain text alongside.
    """
    basename, _ = os.path.splitext(os.path.basename(pdf))
    htmlpath = os.path.join('data', basename + '.html')
    subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta', pdf, htmlpath])
    data = parse_html(htmlpath)
    with open(os.path.join('data', basename + '.txt'), 'w') as outfile:
        outfile.write(data['text'])
    return data


import shutil
# (the next revision of pdftotext continues in the next cell)
def pdftotext(pdf):
    """Convert a pdf to a text file.

    Extract the Author and Title and return a dictionary consisting of the
    author, title, text the source path, the path of the converted text
    file and the file ID."""
    basename, _ = os.path.splitext(os.path.basename(pdf))
    subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta', pdf,
                     os.path.join('data', basename + '.html')])
    data = parse_html(os.path.join('data', basename + '.html'))
    with open(os.path.join('data', basename + '.txt'), 'w') as outfile:
        outfile.write(data['text'])
    # Exercise placeholder completed: record where the PDF came from, where
    # the extracted text lives, and the document's file ID, as promised in
    # the docstring.
    data['source'] = pdf
    data['path'] = os.path.join('data', basename + '.txt')
    data['id'] = basename
    return data


pdftotext("pdfs/muellner2011.pdf")

import configparser

config = configparser.ConfigParser()
config.read('pydf.ini')
config.sections()
config['filepaths']['pdf directory']

from os.path import basename, splitext


def fileid(filepath):
    """Return the basename of a file without its extension.

    >>> fileid('/some/path/to/a/file.pdf')
    'file'
    """
    # NOTE(review): the original doctest expected `file` (no quotes), which
    # can never match the repr of the returned string.
    base, _ = splitext(basename(filepath))
    return base
# (the final, configurable pdftotext continues in the next cell)
def pdftotext(pdf, outdir='.', sourcedir='source', p2t='pdftotext', move=False):
    """Convert a pdf to a text file.

    Extract the Author and Title and return a dictionary consisting of the
    author, title, text the source path, the path of the converted text
    file and the file ID.

    pdf       -- path of the PDF file to convert
    outdir    -- directory receiving the .txt (and temporary .html) output
    sourcedir -- directory the original PDF is copied/moved into
    p2t       -- name/path of the pdftotext executable
    move      -- move the PDF into sourcedir instead of copying it
    """
    filename = fileid(pdf)
    htmlpath = os.path.join(outdir, filename + '.html')
    txtpath = os.path.join(outdir, filename + '.txt')
    # exist_ok avoids the check-then-create race of the original
    # `if not exists: mkdir` pair.
    os.makedirs(sourcedir, exist_ok=True)
    sourcepath = os.path.join(sourcedir, filename + '.pdf')
    subprocess.call([p2t, '-enc', 'UTF-8', '-htmlmeta', pdf, htmlpath])
    data = parse_html(htmlpath)
    os.remove(htmlpath)  # the HTML was only needed to harvest the metadata
    file_action = shutil.move if move else shutil.copy
    file_action(pdf, sourcepath)
    with open(txtpath, 'w') as outfile:
        outfile.write(data['text'])
    data['source'] = sourcepath
    data['path'] = txtpath
    data['id'] = filename  # already computed above; no second fileid() call
    return data


pdftotext("pdfs/blei2003.pdf",
          outdir=config.get('filepaths', 'txt directory'),
          sourcedir=config.get('filepaths', 'source directory'),
          move=config.getboolean('indexer.options', 'move pdfs'))

import glob
# (index_collection continues in the next cell)
def index_collection(configpath):
    "Main routine to index a collection of PDFs using Whoosh."
    config = configparser.ConfigParser()
    # read the configuration file (exercise placeholder completed)
    config.read(configpath)
    recompile = config.getboolean("indexer.options", "recompile")
    # check whether the supplied index directory already exists
    if not os.path.exists(config.get("filepaths", "index directory")):
        # if not, create a new directory and initialize the index
        os.mkdir(config.get("filepaths", "index directory"))
        index = create_in(config.get("filepaths", "index directory"),
                          schema=pdf_schema)
        recompile = True
    # open a connection to the index (exercise placeholder completed)
    index = open_dir(config.get("filepaths", "index directory"))
    # retrieve a set of all file IDs we already indexed
    indexed = set(map(fileid,
                      os.listdir(config.get("filepaths", "txt directory"))))
    # initialize a IndexWriter object (exercise placeholder completed)
    writer = index.writer()
    # iterate over all directories
    for directory in config.get("filepaths", "pdf directory").split(';'):
        # iterate over all PDF files in this directory
        for filepath in glob.glob(directory + "/*.pdf"):
            # poor man's solution to check whether we already indexed this pdf
            if fileid(filepath) not in indexed or recompile:
                try:
                    # call the function pdftotext with the correct arguments
                    # (exercise placeholder completed)
                    data = pdftotext(
                        filepath,
                        outdir=config.get("filepaths", "txt directory"),
                        sourcedir=config.get("filepaths", "source directory"),
                        move=config.getboolean("indexer.options",
                                               "move pdfs"))
                    # add the new document to the index
                    writer.add_document(**data)
                except (IOError, UnicodeDecodeError) as error:
                    print(error)
    # commit our changes (exercise placeholder completed)
    writer.commit()


index_collection('pydf.ini')

from flask import Flask

app = Flask(__name__)


@app.route("/")
def hello():
    return "Hello World!"
if __name__ == "__main__":
    app.run(port=5000)

from flask import Flask, render_template

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


if __name__ == '__main__':
    app.run(debug=True, host='localhost', port=8000,
            use_reloader=True, threaded=True)

from whoosh.index import open_dir
from whoosh.qparser import QueryParser


def search(query):
    """Search the PDF index for *query* and yield one dict per hit.

    Exercise placeholder completed (the original had two identical
    placeholder definitions; collapsed into one): each yielded dict carries
    the stored fields of the hit plus a 'snippet' of highlighted matches,
    which is what to_html below expects.
    """
    ix = open_dir('pdf-index')
    parser = QueryParser("text", ix.schema)
    parsed = parser.parse(query)
    with ix.searcher() as searcher:
        for hit in searcher.search(parsed):
            result = dict(hit)
            # `text` is not a stored field, so re-read the extracted text
            # file (its stored path) to let Whoosh build the snippet.
            with open(hit['path'], encoding='utf-8') as infile:
                result['snippet'] = hit.highlights("text",
                                                   text=infile.read())
            yield result


print(list(search("(topic model) OR (index probability")))


def to_html(result):
    "Return a representation of a search result in HTML."
    title = result['title'] if 'title' in result else result['id']
    author = result['author'] if 'author' in result else ''
    # NOTE(review): the original template had three %s slots for four
    # arguments, which raises TypeError at run time -- the markup was
    # evidently stripped in the export.  Reconstructed as a source link
    # followed by the author line and the snippet.
    html = """
    <p><a href="%s">%s</a><br/>
    %s<br/>
    %s</p>
    """ % (result['source'], title, author, result['snippet'])
    return html


print(to_html(next(search("topic model"))))

from flask import request, jsonify


@app.route('/searchbox', methods=['POST'])
def searchbox():
    query = request.form['q'].strip()
    html_results = '\n'.join(map(to_html, search(query)))
    return jsonify({'html': html_results})

from IPython.core.display import HTML


def css_styling():
    # `with` closes the stylesheet handle (the original leaked it).
    with open("styles/custom.css", "r") as infile:
        return HTML(infile.read())


css_styling()