def description_result(self, query, timeout=10):
    """ Return the "Abstract" text of a DuckDuckGo answer for *query*

    :param query: search text; an object exposing a ``.string`` attribute
        is also accepted, in which case that attribute is used instead.
    :param timeout: seconds to wait for the HTTP response before giving
        up, so a stalled connection cannot block the caller forever.
    :returns: the value stored under "Abstract" in the JSON answer, or
        None when the query is not text, the request fails, the body is
        not a JSON object, "Results" is missing or empty, or "Abstract"
        is absent.
    """
    # Function-scope import: keeps this method self-contained without
    # touching the module-level import block.
    from urllib.parse import quote_plus

    query = getattr(query, "string", query)
    if not isinstance(query, str):
        # ``.string`` can be None, or the caller passed a non-text value.
        return None

    # Percent-encode the query so characters such as "&" or "#" cannot
    # split or truncate the URL parameter.
    # NOTE(review): assumes self.url ends with the query-parameter prefix
    # (callers pass bare titles and "&format=json" is appended) - confirm.
    target = self.url + quote_plus(query) + "&format=json"
    try:
        payload = json.loads(requests.get(target, timeout=timeout).text)
    except (requests.RequestException, ValueError):
        # Network/HTTP failure, or a body that is not valid JSON.
        return None

    # NOTE(review): an answer that carries an abstract but an empty
    # "Results" list still yields None; this gate is kept from the
    # original implementation - confirm it is intended.
    if not isinstance(payload, dict) or not payload.get("Results"):
        return None
    # dict.get() instead of subscripting: a missing key yields None rather
    # than an unhandled KeyError.
    return payload.get("Abstract")
def extract_metadata_pdf(self, book):
    """ Return extracted metadata for a pdf file

    :NOTES: Embedded pdf titles have proven unreliable - some correspond
        with the actual title and others are nonsense - so the title is
        derived from the file name instead (underscores become spaces).
    :param book: path of the pdf file.
    :returns: None when the file cannot be read as a pdf, cannot be
        decrypted with an empty password, or has no document info;
        otherwise a list in this order: title, author, cover_image,
        fname, description, identifier, publisher, date, rights, ftags
        (the last five are always None for pdfs).
    """
    try:
        pdf = PyPDF2.PdfFileReader(book)
        # Some pdfs report as encrypted although they open without a
        # password: try the empty user password before giving up.
        # decrypt() returns 0 when the password does not match.
        if pdf.isEncrypted and not pdf.decrypt(""):
            return None
        info = pdf.getDocumentInfo()
    except Exception:
        # PyPDF2 raises assorted error types on damaged or unsupported
        # files; any of them means the metadata is not retrievable.
        return None
    if info is None:
        # check to ensure we actually have a pdf
        return None

    fname = str(book)
    stem, _extension = os.path.splitext(os.path.basename(fname))
    title = stem.replace("_", " ")
    author = info.author  # already None when the pdf names no author

    # Online lookups are best-effort: a failure only blanks that field.
    ddg = DuckDuckGo()
    try:
        cover_image = ddg.image_result(title)
    except Exception:
        cover_image = None
    try:
        description = ddg.description_result(title)
    except Exception:
        description = None

    # Fields this extractor does not populate for pdfs.
    identifier = None
    publisher = None
    date = None
    rights = None
    ftags = None
    return [
        title,
        author,
        cover_image,
        fname,
        description,
        identifier,
        publisher,
        date,
        rights,
        ftags,
    ]