PDF Support, Description, & image acquisition needs work.

2026-06-26 23:11:38 -04:00 · 2020-12-12 13:24:17 -05:00
parent 11fe1c7d40
commit 7784f12c29
3 changed files with 79 additions and 13 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,3 +26,4 @@ ptvsd
 pudb
 daphne
 whitenoise
 pypdf2
--- a/src/backend/lib/api_hooks.py
+++ b/src/backend/lib/api_hooks.py
@@ -1,8 +1,6 @@
 #!/usr/bin/python
 import sys
 import requests
-
+import json
 # sys.path.insert(1, 'lib/')
@@ -33,3 +31,22 @@ class DuckDuckGo:
            return image.raw
        else:
            return False
    def description_result(self, query):
        """Return the abstract text DuckDuckGo reports for ``query``.

        Sends a GET request to ``self.url + query + "&format=json"`` and reads
        the ``"Abstract"`` field of the decoded JSON reply.

        :param query: the search text, or any object that exposes the text
            through a ``.string`` attribute
        :returns: the value stored under ``"Abstract"``; ``None`` when the
            request or the JSON decoding fails, when the reply is not a JSON
            object, when ``"Results"`` is missing or empty, or when the reply
            has no ``"Abstract"`` key
        """
        _key = "&format=json"
        # Unwrap objects that carry their text in ``.string``; anything
        # without that attribute is used unchanged.
        query = getattr(query, "string", query)
        try:
            # Best-effort lookup: a network failure, a timeout, an undecodable
            # body or a non-string query all mean "no description available".
            # The timeout keeps an unresponsive server from blocking forever.
            response = requests.get(self.url + query + _key, timeout=10)
            _r = json.loads(response.text)
        except Exception:
            return None
        # Only trust the abstract when the reply is a JSON object that also
        # carries at least one result entry.
        if not isinstance(_r, dict) or not _r.get("Results"):
            return None
        # ``dict.get`` yields None for a missing key instead of raising the
        # KeyError that a plain subscript would.
        return _r.get("Abstract")
--- a/src/backend/lib/library.py
+++ b/src/backend/lib/library.py
@@ -1,15 +1,13 @@
 #!/usr/bin/env python
 import json
 import os
 import pathlib
 import re
 import zipfile
 import PyPDF2
 from bs4 import BeautifulSoup
 from mobi import Mobi
 from .api_hooks import DuckDuckGo
 from .config import Config
 from .storage import Storage
@@ -57,16 +55,11 @@ class Catalogue:
        :returns self._book_list_expanded: json string containing all book metadata
        """
        self.scan_folder()  # Populate file list
-        regx = re.compile(r"\.epub|\.mobi")
+        regx = re.compile(r"\.epub|\.mobi|\.pdf")
        try:
            self.books = list(filter(regx.search, filter(None, self.file_list)))
        except TypeError as e:
            self.config.logger.error(e)
        """
        for book in self.books:
            self._book_list_expanded[book] = self.process_by_filetype(book)
        return self._book_list_expanded
        """
    def process_by_filetype(self, book):
        if book.endswith(".epub"):
@@ -74,6 +67,8 @@ class Catalogue:
            return self.extract_metadata_epub(epub)
        elif book.endswith(".mobi"):
            return self.extract_metadata_mobi(book)
        elif book.endswith(".pdf"):
            return self.extract_metadata_pdf(book)
    @staticmethod
    def process_epub(book):
@@ -165,6 +160,56 @@ class Catalogue:
            ]
        return book_details
    def extract_metadata_pdf(self, book):
        """Return the metadata record extracted for a PDF file.

        The title is built from the file name (extension dropped, underscores
        replaced by spaces). The author is read from the PDF document info.
        The cover image and the description are looked up through
        ``DuckDuckGo`` using that title.

        :NOTES: Retrieval of data has been problematic, some PDFs providing
        reliable titles that correspond with the actual one, and others being
        nonsense.

        :param book: path of the PDF file
        :returns: ``None`` when the file cannot be opened or its document
            info cannot be read; otherwise the list ``[title, author,
            cover_image, fname, description, identifier, publisher, date,
            rights, ftags]`` in which the last five entries are always
            ``None``
        """
        # Work on the string form throughout so that path-like arguments are
        # handled the same way as plain strings.
        fname = str(book)
        try:
            # Some PDFs raise odd errors here, e.g. reporting themselves as
            # encrypted when they are not; any failure means "no metadata".
            info = PyPDF2.PdfFileReader(fname).getDocumentInfo()
        except Exception:
            return None
        if info is None:
            # check to ensure we actually have a pdf
            return None

        title = fname.split("/")[-1].rsplit(".", 1)[0].replace("_", " ")
        author = info.author

        ddg = DuckDuckGo()
        try:
            cover_image = ddg.image_result(title)
        except Exception:
            # The cover lookup is best effort; a failed search must not
            # prevent the book from being catalogued.
            cover_image = None
        description = ddg.description_result(title)

        # None of the remaining fields are extracted for PDFs.
        identifier = None
        publisher = None
        date = None
        rights = None
        ftags = None
        return [
            title,
            author,
            cover_image,
            fname,
            description,
            identifier,
            publisher,
            date,
            rights,
            ftags,
        ]
    @staticmethod
    def stripTags(source):
        p = re.compile(r"<.*?>")
@@ -276,7 +321,10 @@ class Catalogue:
        for book in book_list:
            book = self.process_by_filetype(book)
            with open(fsocket, 'w') as _socket:
-                _socket.write(book[0])
+                try:
                    _socket.write(book[0])
                except TypeError:
                    continue
            _socket.close()
            db.insert_book(book)
        inserted = db.commit()