PDF Support, Description, & image acquisition needs work.

2026-06-26 23:11:38 -04:00 · 2020-12-12 13:24:17 -05:00
parent 11fe1c7d40
commit 7784f12c29
3 changed files with 79 additions and 13 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,3 +26,4 @@ ptvsd
 pudb
 daphne
 whitenoise
+pypdf2
--- a/src/backend/lib/api_hooks.py
+++ b/src/backend/lib/api_hooks.py
@@ -1,8 +1,6 @@
 #!/usr/bin/python
-import sys
-
 import requests
-
+import json
 # sys.path.insert(1, 'lib/')


@@ -33,3 +31,22 @@ class DuckDuckGo:
            return image.raw
        else:
            return False
+
+    def description_result(self, query):
+        """Return DuckDuckGo's "Abstract" text for ``query``, or None.
+
+        ``query`` is a string or an object exposing ``.string``. None is
+        returned if the request/JSON decode fails or "Results" is empty.
+        """
+        _key = "&format=json"
+        # Tag-like objects carry the text in ``.string``; strings do not.
+        query = getattr(query, "string", query)
+        try:
+            _r = json.loads(requests.get(self.url + query + _key).text)
+        except Exception:
+            return None
+        # Subscripting a dict raises KeyError (not AttributeError), so use
+        # .get(); a non-dict payload is treated as "no description".
+        if not isinstance(_r, dict) or not _r.get("Results"):
+            return None
+        return _r.get("Abstract")
--- a/src/backend/lib/library.py
+++ b/src/backend/lib/library.py
@@ -1,15 +1,13 @@
 #!/usr/bin/env python
-import json
 import os
-import pathlib
 import re
 import zipfile
+import PyPDF2

 from bs4 import BeautifulSoup
 from mobi import Mobi

 from .api_hooks import DuckDuckGo
-from .config import Config
 from .storage import Storage


@@ -57,16 +55,11 @@ class Catalogue:
        :returns self._book_list_expanded: json string containing all book metadata
        """
        self.scan_folder()  # Populate file list
-        regx = re.compile(r"\.epub|\.mobi")
+        regx = re.compile(r"\.epub|\.mobi|\.pdf")
        try:
            self.books = list(filter(regx.search, filter(None, self.file_list)))
        except TypeError as e:
            self.config.logger.error(e)
-        """
-        for book in self.books:
-            self._book_list_expanded[book] = self.process_by_filetype(book)
-        return self._book_list_expanded
-        """

    def process_by_filetype(self, book):
        if book.endswith(".epub"):
@@ -74,6 +67,8 @@ class Catalogue:
            return self.extract_metadata_epub(epub)
        elif book.endswith(".mobi"):
            return self.extract_metadata_mobi(book)
+        elif book.endswith(".pdf"):
+            return self.extract_metadata_pdf(book)

    @staticmethod
    def process_epub(book):
@@ -165,6 +160,56 @@ class Catalogue:
            ]
        return book_details

+    def extract_metadata_pdf(self, book):
+        """Return a metadata record for the PDF at ``book``, or None.
+
+        The title is derived from the file name rather than from the
+        embedded document info, because retrieval of that data has been
+        problematic: some PDFs provide reliable titles that correspond
+        with the actual one, and others provide nonsense. The cover image
+        and description are looked up on DuckDuckGo using that title.
+
+        :param book: path of the PDF file, as a string
+        :returns: list of [title, author, cover_image, fname, description,
+            identifier, publisher, date, rights, ftags], or None when the
+            file cannot be read or carries no document info
+        """
+        ddg = DuckDuckGo()
+        try:
+            pdf = PyPDF2.PdfFileReader(book)
+            # Some PDFs raise odd errors here, reporting themselves as
+            # encrypted when they are not; treat those as unreadable.
+            info = pdf.getDocumentInfo()
+        except Exception:
+            return None
+        if info is None:
+            # No document info at all: treated as not being a usable PDF.
+            return None
+        fname = str(book)
+        # Drop the directory and extension; underscores become spaces.
+        title = os.path.basename(fname).rsplit(".", 1)[0].replace("_", " ")
+        # Stays None when the document info does not provide an author.
+        author = info.author
+        try:
+            cover_image = ddg.image_result(title)
+        except Exception:
+            # Best effort: a failed image lookup must not drop the book.
+            cover_image = None
+        description = ddg.description_result(title)
+        # The remaining fields are not extracted from PDFs.
+        return [
+            title,
+            author,
+            cover_image,
+            fname,
+            description,
+            None,  # identifier
+            None,  # publisher
+            None,  # date
+            None,  # rights
+            None,  # ftags
+        ]
+
    @staticmethod
    def stripTags(source):
        p = re.compile(r"<.*?>")
@@ -276,7 +321,10 @@ class Catalogue:
        for book in book_list:
            book = self.process_by_filetype(book)
            with open(fsocket, 'w') as _socket:
-                _socket.write(book[0])
+                try:
+                    _socket.write(book[0])
+                except TypeError:
+                    continue
            _socket.close()
            db.insert_book(book)
        inserted = db.commit()