PDF Support, Description, & image acquisition needs work.

This commit is contained in:
th3r00t
2020-12-12 13:24:17 -05:00
parent 11fe1c7d40
commit 7784f12c29
3 changed files with 79 additions and 13 deletions

View File

@@ -26,3 +26,4 @@ ptvsd
pudb pudb
daphne daphne
whitenoise whitenoise
pypdf2

View File

@@ -1,8 +1,6 @@
#!/usr/bin/python #!/usr/bin/python
import sys
import requests import requests
import json
# sys.path.insert(1, 'lib/') # sys.path.insert(1, 'lib/')
@@ -33,3 +31,22 @@ class DuckDuckGo:
return image.raw return image.raw
else: else:
return False return False
def description_result(self, query):
    """Return the DuckDuckGo abstract text for ``query``.

    :param query: search terms, either a string or an object exposing its
        text through a ``.string`` attribute.
    :returns: the ``Abstract`` field of the JSON reply, or ``None`` when
        the request or decoding fails, when the reply has a missing or
        empty ``Results`` list, or when it carries no ``Abstract``.
    """
    from urllib.parse import quote_plus

    # Unwrap objects that carry their text in ``.string``.
    query = getattr(query, "string", query)
    if not isinstance(query, str):
        # Nothing sensible can be appended to the URL.
        return None
    try:
        # Escape the query so characters such as "&", "#" or "?" in a
        # title cannot truncate or corrupt the request.
        # NOTE(review): assumes self.url ends with the query parameter
        # (e.g. "...?q=") -- confirm against the class definition.
        response = requests.get(self.url + quote_plus(query) + "&format=json")
        # Decode through the response object so this method does not rely
        # on a module-level ``json`` import being present.
        reply = response.json()
    except (requests.RequestException, ValueError, TypeError):
        # Best effort: network, decoding, or URL-building failures simply
        # mean that no description is available.
        return None
    # NOTE(review): the reply is only trusted when "Results" is non-empty;
    # confirm that an abstract is never wanted when "Results" is empty.
    if not isinstance(reply, dict) or not reply.get("Results"):
        return None
    return reply.get("Abstract")

View File

@@ -1,15 +1,13 @@
#!/usr/bin/env python #!/usr/bin/env python
import json
import os import os
import pathlib
import re import re
import zipfile import zipfile
import PyPDF2
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from mobi import Mobi from mobi import Mobi
from .api_hooks import DuckDuckGo from .api_hooks import DuckDuckGo
from .config import Config
from .storage import Storage from .storage import Storage
@@ -57,16 +55,11 @@ class Catalogue:
:returns self._book_list_expanded: json string containing all book metadata :returns self._book_list_expanded: json string containing all book metadata
""" """
self.scan_folder() # Populate file list self.scan_folder() # Populate file list
regx = re.compile(r"\.epub|\.mobi") regx = re.compile(r"\.epub|\.mobi|\.pdf")
try: try:
self.books = list(filter(regx.search, filter(None, self.file_list))) self.books = list(filter(regx.search, filter(None, self.file_list)))
except TypeError as e: except TypeError as e:
self.config.logger.error(e) self.config.logger.error(e)
"""
for book in self.books:
self._book_list_expanded[book] = self.process_by_filetype(book)
return self._book_list_expanded
"""
def process_by_filetype(self, book): def process_by_filetype(self, book):
if book.endswith(".epub"): if book.endswith(".epub"):
@@ -74,6 +67,8 @@ class Catalogue:
return self.extract_metadata_epub(epub) return self.extract_metadata_epub(epub)
elif book.endswith(".mobi"): elif book.endswith(".mobi"):
return self.extract_metadata_mobi(book) return self.extract_metadata_mobi(book)
elif book.endswith(".pdf"):
return self.extract_metadata_pdf(book)
@staticmethod @staticmethod
def process_epub(book): def process_epub(book):
@@ -165,6 +160,56 @@ class Catalogue:
] ]
return book_details return book_details
def extract_metadata_pdf(self, book):
    """Return the metadata list for a PDF book.

    The title is taken from the file name instead of the PDF's embedded
    metadata, because embedded titles have been unreliable: some PDFs
    provide titles that correspond to the actual book, others provide
    nonsense.

    :param book: path of the PDF file.
    :returns: ``None`` when the file cannot be read as a PDF or has no
        document info; otherwise the list ``[title, author, cover_image,
        fname, description, identifier, publisher, date, rights, ftags]``,
        whose last five entries are always ``None``.
    """
    ddg = DuckDuckGo()
    try:
        # Some PDFs raise odd errors here, e.g. reporting themselves as
        # encrypted when they are not; treat any failure as unusable.
        info = PyPDF2.PdfFileReader(book).getDocumentInfo()
    except Exception:
        return None
    if info is None:
        # No document info: not something we can catalogue as a PDF.
        return None

    # Derive both values from the string form of the path, so a path
    # object works as well as a plain string.
    fname = str(book)
    title = os.path.basename(fname).rsplit(".", 1)[0].replace("_", " ")

    # ``info.author`` is already ``None`` when the PDF names no author.
    author = info.author

    try:
        cover_image = ddg.image_result(title)
    except Exception:
        # Cover lookup is best effort; a failed search must not abort the
        # import, but interpreter-exit signals are no longer swallowed.
        cover_image = None
    description = ddg.description_result(title)

    # These fields are not extracted for PDFs.
    identifier = None
    publisher = None
    date = None
    rights = None
    ftags = None
    return [
        title,
        author,
        cover_image,
        fname,
        description,
        identifier,
        publisher,
        date,
        rights,
        ftags,
    ]
@staticmethod @staticmethod
def stripTags(source): def stripTags(source):
p = re.compile(r"<.*?>") p = re.compile(r"<.*?>")
@@ -276,7 +321,10 @@ class Catalogue:
for book in book_list: for book in book_list:
book = self.process_by_filetype(book) book = self.process_by_filetype(book)
with open(fsocket, 'w') as _socket: with open(fsocket, 'w') as _socket:
_socket.write(book[0]) try:
_socket.write(book[0])
except TypeError:
continue
_socket.close() _socket.close()
db.insert_book(book) db.insert_book(book)
inserted = db.commit() inserted = db.commit()