温馨提示:本文翻译自stackoverflow.com,查看原文请点击:python - How do I Make a PDF searchable for a flask search application?
flask python search

python - 如何让 Flask 搜索应用程序能够搜索 PDF 文件的内容?

发布于 2020-04-13 12:36:44

我一直在为一个非常重要的个人项目做研究。我想创建一个Flask搜索应用程序,使我可以搜索100个以上的PDF文件中的内容。我发现了一些与flask一起使用的ElasticSearch Lib信息。

#!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import json
from flask import Flask, jsonify, request, render_template,  json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch; this is the client used by
# the index()/get() calls further down in this script
elastic_client = Elasticsearch(hosts=["localhost"])
# second client, built from an explicit URL
# NOTE(review): in this script `es` is only referenced inside the disabled
# insert_data route; presumably it targets the same node as
# `elastic_client` -- confirm and consider keeping a single client
es = Elasticsearch("http://localhost:9200/")
# Flask application object that the @app.route handlers below register on
app = Flask(__name__)

# Build a small sample document with FPDF and write it to disk.
pdf = FPDF()

# Ten pages, each one selecting 14pt Arial and printing the same single line.
for page in range(10):
    pdf.add_page()
    pdf.set_font("Arial", size=14)
    pdf.cell(
        150,
        12,
        txt="Object Rocket ROCKS!!",
        ln=1,
        align="C",
    )

# Write the finished document to "object_rocket.pdf" (relative path).
pdf.output("object_rocket.pdf")

'''
read_pdf = PyPDF2.PdfFileReader("object_rocket.pdf")
page = read_pdf.getPage(0)
page_mode = read_pdf.getPageMode()
page_text = page.extractText()
print (type(page_text))
'''
#with open(path, 'rb') as file:

# Read "Sheet3.pdf" and collect its metadata and per-page text in `all_pages`.
#
# Resulting layout:
#   all_pages["meta"]  -> dict of the PDF's document-information entries
#   all_pages[<int>]   -> extracted text of the page with that 0-based index

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)

# get the read object's meta info; getDocumentInfo() returns None for a PDF
# without a document information dictionary, so fall back to an empty
# mapping instead of crashing on `.items()` below
pdf_meta = read_pdf.getDocumentInfo() or {}

# get the page numbers
num = read_pdf.getNumPages()
print("PDF pages:", num)

# create a dictionary object for page data, with the meta data under its
# own key
all_pages = {"meta": {}}

# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print(meta, value)
    all_pages["meta"][meta] = value

# iterate the page numbers and store each page's extracted text under its
# 0-based page index
for page in range(num):
    data = read_pdf.getPage(page)
    page_text = data.extractText()
    all_pages[page] = page_text

# Round-trip the extracted PDF data through Elasticsearch:
#   1. serialise `all_pages` to JSON and base64-encode it,
#   2. store it as document "42" of the "pdf" index,
#   3. fetch it back, decode it, and
#   4. index the decoded dictionary as document "42" of the "pdftext" index.

# create a JSON string from the dictionary
json_data = json.dumps(all_pages)

# convert JSON string to bytes-like obj
bytes_string = json_data.encode("utf-8")

# convert bytes to a base64 encoded *text* string.  b64encode() returns
# bytes; decoding them as ASCII yields the plain base64 text.  (Calling
# str() on the bytes would instead store the repr "b'...'" in Elasticsearch
# and force the reader to slice the wrapper off again.)
encoded_pdf = base64.b64encode(bytes_string).decode("ascii")

# put the PDF data into a dictionary body to pass to the API request
body_doc = {"data": encoded_pdf}

# call the index() method to index the data
result = elastic_client.index(index="pdf", doc_type="_doc", id="42", body=body_doc)

# make another Elasticsearch API request to get the indexed PDF (same
# string id as used for indexing)
result = elastic_client.get(index="pdf", doc_type="_doc", id="42")

# pull the stored base64 text out of the response
result_data = result["_source"]["data"]

# decode the base64 data back into the original JSON text; no slicing is
# needed because the stored value is clean base64
decoded_pdf = base64.b64decode(result_data).decode("utf-8")

# take decoded string and make into JSON object, then index the readable
# dictionary (meta + page texts) under its own index
json_dict = json.loads(decoded_pdf)
result2 = elastic_client.index(index="pdftext", doc_type="_doc", id="42", body=json_dict)

# create new FPDF object
# NOTE(review): this rebinds `pdf`; the earlier 10-page sample object is
# dropped here (its file was already written by pdf.output above).
pdf = FPDF()

# build the new PDF from the Elasticsearch dictionary
# Use 'iteritems()` instead of 'items()' for Python 2
# NOTE(review): the rebuild loop that follows is disabled -- it is wrapped in
# a string literal, so nothing is added to this new `pdf` object.  It also
# iterates `json_data` (the JSON string); presumably `json_dict.items()` was
# intended -- confirm before re-enabling.
""" for page, value in json_data:
    if page != "meta":
        # create new page
        pdf.add_page()
        pdf.set_font("Arial", size=14)

        # add content to page
        output = value + " -- Page: " + str(int(page)+1)
        pdf.cell(150, 12, txt=output, ln=1, align="C")
    else:
        # create the meta data for the new PDF
        for meta, meta_val in json_dict["meta"].items():
            if "title" in meta.lower():
                pdf.set_title(meta_val)
            elif "producer" in meta.lower() or "creator" in meta.lower():
                pdf.set_creator(meta_val)
 """
# output the PDF object's data to a PDF file
#pdf.output("object_rocket_from_elaticsearch.pdf" )

@app.route('/', methods=['GET'])
def index():
    """Serve the whole decoded PDF dictionary (`json_dict`) as a JSON response."""
    payload = json_dict
    return jsonify(payload)

@app.route('/<id>', methods=['GET'])
def index_by_id(id):
    """Serve a single entry of the decoded PDF dictionary as JSON.

    `id` is the URL path segment and is used directly as a key of
    `json_dict`.  When the key is not present, a JSON error body with
    HTTP status 404 is returned instead of letting the KeyError surface
    as a 500 error.
    """
    try:
        entry = json_dict[id]
    except KeyError:
        return jsonify({"error": "not found", "id": id}), 404
    return jsonify(entry)


""" @app.route('/insert_data', methods=['PUT'])
def insert_data():
    slug = request.form['slug']
    title = request.form['title']
    content = request.form['content']

    body = {
        'slug': slug,
        'title': title,
        'content': content,
        'timestamp': datetime.now()
    }

    result = es.index(index='contents', doc_type='title', id=slug, body=body)

    return jsonify(result) """



# Start the Flask server only when this file is executed as a script, so that
# importing the module does not block on the server loop.
if __name__ == "__main__":
    # NOTE(review): debug=True is a development setting; do not expose a
    # server started this way to untrusted networks.
    app.run(port=5003, debug=True)

------进展------我现在有一个没有前端搜索功能的可行解决方案:

# Load_single_PDF_BY_PAGE_TO_index.py
  #!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64

from flask import Flask, jsonify, request, render_template,  json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch; this is the client the
# per-page index() calls below go through
elastic_client = Elasticsearch(hosts=["localhost"])
# NOTE(review): `es` and `app` are not referenced by the loader code visible
# in this script -- presumably left over from the Flask version; confirm
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)


#with open(path, 'rb') as file:

# Load "Sheet3.pdf" and index every page as its own Elasticsearch document
# in the "pdfclearn" index, so that each page's text is searchable.

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)

# get the read object's meta info; getDocumentInfo() returns None for a PDF
# without a document information dictionary, so fall back to an empty
# mapping instead of crashing on `.items()` below
pdf_meta = read_pdf.getDocumentInfo() or {}

# get the page numbers
num = read_pdf.getNumPages()
print("PDF pages:", num)

# create a dictionary object for page data, with the meta data under its
# own key
all_pages = {"meta": {}}

# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print(meta, value)
    all_pages["meta"][meta] = value

# document ids start at 44 and increase by one for every page
x = 44
# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

    # index the plain page text under the "data" field
    body_doc2 = {"data": page_text}
    result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
    x += 1

上面的代码按页面将单个pdf加载到elasticsearch中。

from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
# Elasticsearch client shared by all route handlers below
es = Elasticsearch("http://localhost:9200/")

# Flask application object the @app.route handlers register on
app = Flask(__name__)

@app.route('/pdf', methods=['GET'])
def index():
    """Return the `_source` of document '44' in the 'pdfclearn' index as JSON."""
    document = es.get(index='pdfclearn', doc_type='_doc', id='44')
    source = document['_source']
    return jsonify(source)


@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
    """Return the `_source` of the 'pdfclearn' document whose id is the URL segment."""
    document = es.get(index='pdfclearn', doc_type='_doc', id=id)
    source = document['_source']
    return jsonify(source)



@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
    """Run a multi_match query for `keyword` on the "data" field of 'pdfclearn'.

    Returns the list found under ``hits.hits`` of the search response as JSON.
    """
    match_clause = {"query": keyword, "fields": ["data"]}
    body = {"query": {"multi_match": match_clause}}

    response = es.search(index="pdfclearn", doc_type="_doc", body=body)
    hits = response['hits']['hits']
    return jsonify(hits)

@app.route("/searhbar")
def searhbar():
    """Render and return the ``index.html`` template."""
    page = render_template("index.html")
    return page

@app.route("/searhbar/<string:box>")
def process(box):
    """Search the 'pdfclearn' index for the text submitted from a search box.

    The search term is taken from the ``query`` URL argument.  When that
    argument is missing or empty, the `box` path segment itself is used as
    the term, so `keyword` is always bound (previously it was only assigned
    when ``box == 'names'`` and the ``query`` argument was ignored).

    Returns the list found under ``hits.hits`` of the search response as JSON.
    """
    query = request.args.get('query')
    keyword = query if query else box

    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

# Start the Flask server only when this file is executed as a script, so that
# importing the module does not block on the server loop.
if __name__ == "__main__":
    # NOTE(review): debug=True is a development setting; do not expose a
    # server started this way to untrusted networks.
    app.run(port=5003, debug=True)

在上面的代码中,我们可以在所有页面上搜索关键字或词组。

curl http://127.0.0.1:5003/search/test //it works!!

我找到了一篇博客,介绍如何把PDF文件以Base64的形式索引到ElasticSearch中。我也看到DocuSign的API用这种方式来做文档模板。但是,我不明白如何把Base64编码的PDF转换成JSON,使其内容能够被ElasticSearch搜索。

curl "http://localhost:9200/pdftext/_doc/42"

curl -X POST "http://localhost:9200/pdf/_search?q=*"

我可以检索700页文档的Base64。但是我认为我需要索引并检索文档的每一页。

我研究过的博客让我取得了一些进展:

最终目标:

我将继续研究Elastic Search和Base64编码和解码。但我希望获得一些帮助,以实现自己的目标。任何详细的示例将不胜感激。

查看更多

提问者
BlackFox
被浏览
55
BlackFox 2020-02-17 14:39

因此,我找到了一个名为scout的库,并使其正常工作!

from scout_client import Scout

# import libraries to help read and create PDF

import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template,  json

client = Scout('http://localhost:8000')

# Upload the text of books/7.pdf .. books/17.pdf to the Scout server, one
# document per page, all attached to the 'test13' index.
import requests

# The endpoint and headers are the same for every page, so build them once.
url = 'http://localhost:8000/documents/'
headers = {
    'Content-Type': 'application/json',
}

for k in range(7,18):
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
    num = read_pdf.getNumPages()
    print("PDF pages:", num)

    # extracted text of every page, in page order
    all_pages = [read_pdf.getPage(page).extractText() for page in range(num)]

    # Reset per book so the print below never hits an unbound name (or shows
    # a previous book's response) when a PDF yields no pages.
    response = None
    for z in all_pages:
        data = {'content': z, 'indexes': ['test13']}
        response = requests.post(url, data=json.dumps(data), headers=headers)

    # response of the last page posted for this book (None if it had no pages)
    print(response)
  • 我现在可以在本地循环遍历任意数量的PDF
  • 发布到服务器以建立索引
  • 并搜索关键字

现在,我只需要有人帮我做一个带搜索栏的基本前端,让这个搜索栏能够通过Python和Flask返回的JSON响应来获取数据。