PDF Support, Description, & image acquisition needs work.

This commit is contained in:
th3r00t
2020-12-12 13:24:17 -05:00
parent 11fe1c7d40
commit 7784f12c29
3 changed files with 79 additions and 13 deletions

View File

@@ -26,3 +26,4 @@ ptvsd
pudb
daphne
whitenoise
pypdf2

View File

@@ -1,8 +1,6 @@
#!/usr/bin/python
import sys
import requests
import json
# sys.path.insert(1, 'lib/')
@@ -33,3 +31,22 @@ class DuckDuckGo:
return image.raw
else:
return False
def description_result(self, query):
    """Return the DuckDuckGo abstract for *query*, or None when unavailable.

    :param query: search text, or an object exposing that text through a
        ``string`` attribute.
    :returns: the value stored under ``"Abstract"`` in the decoded JSON
        response; None when the request or decoding fails, when the
        response has no ``"Results"`` entries, or when ``"Abstract"`` is
        missing.
    """
    from urllib.parse import quote_plus

    # Accept objects that wrap their text in a ``string`` attribute;
    # anything without that attribute is used as-is.
    query = getattr(query, "string", query)
    try:
        # Percent-encode the query so characters such as "&", "#" or spaces
        # cannot split or truncate the query string.
        url = self.url + quote_plus(query) + "&format=json"
        # A timeout keeps an unresponsive server from stalling the caller.
        payload = json.loads(requests.get(url, timeout=10).text)
    except Exception:
        # Best effort: any network, encoding or decoding failure simply
        # means no description is available.
        return None
    if not isinstance(payload, dict):
        return None
    # NOTE(review): as before, an empty "Results" list is treated as "no
    # answer" even if "Abstract" is populated -- confirm this is intended.
    if not payload.get("Results"):
        return None
    # Dict lookups raise KeyError (not AttributeError) on a missing key;
    # .get() covers that case without a try block.
    return payload.get("Abstract")

View File

@@ -1,15 +1,13 @@
#!/usr/bin/env python
import json
import os
import pathlib
import re
import zipfile
import PyPDF2
from bs4 import BeautifulSoup
from mobi import Mobi
from .api_hooks import DuckDuckGo
from .config import Config
from .storage import Storage
@@ -57,16 +55,11 @@ class Catalogue:
:returns self._book_list_expanded: json string containing all book metadata
"""
self.scan_folder() # Populate file list
regx = re.compile(r"\.epub|\.mobi")
regx = re.compile(r"\.epub|\.mobi|\.pdf")
try:
self.books = list(filter(regx.search, filter(None, self.file_list)))
except TypeError as e:
self.config.logger.error(e)
"""
for book in self.books:
self._book_list_expanded[book] = self.process_by_filetype(book)
return self._book_list_expanded
"""
def process_by_filetype(self, book):
if book.endswith(".epub"):
@@ -74,6 +67,8 @@ class Catalogue:
return self.extract_metadata_epub(epub)
elif book.endswith(".mobi"):
return self.extract_metadata_mobi(book)
elif book.endswith(".pdf"):
return self.extract_metadata_pdf(book)
@staticmethod
def process_epub(book):
@@ -165,6 +160,56 @@ class Catalogue:
]
return book_details
def extract_metadata_pdf(self, book):
    """Return the metadata list for the PDF at *book*, or None if unusable.

    :NOTES: Retrieval of data has been problematic: some PDFs provide
        reliable titles that correspond with the actual title, and others
        provide nonsense. The title is therefore derived from the file
        name rather than from the document itself.
    :param book: path of the PDF file
    :returns: ``[title, author, cover_image, fname, description,
        identifier, publisher, date, rights, ftags]`` where the last five
        entries are always None, or None when the file cannot be read or
        carries no document information.
    """
    ddg = DuckDuckGo()
    try:
        # Some PDFs raise odd errors here, e.g. reporting themselves as
        # encrypted when they are not; any failure means "not usable".
        info = PyPDF2.PdfFileReader(book).getDocumentInfo()
    except Exception:
        return None
    if info is None:
        # No document information: treat the file as not being a real PDF.
        return None

    fname = str(book)
    # File name without directory or extension, underscores as spaces.
    title = os.path.splitext(os.path.basename(fname))[0].replace("_", " ")
    author = info.author  # already None when the PDF names no author

    # Both lookups are best effort: a failed search must not abort the
    # cataloguing of the book itself.
    try:
        cover_image = ddg.image_result(title)
    except Exception:
        cover_image = None
    try:
        description = ddg.description_result(title)
    except Exception:
        description = None

    # Fields this extractor does not populate for PDFs.
    identifier = publisher = date = rights = ftags = None
    return [
        title,
        author,
        cover_image,
        fname,
        description,
        identifier,
        publisher,
        date,
        rights,
        ftags,
    ]
@staticmethod
def stripTags(source):
p = re.compile(r"<.*?>")
@@ -276,7 +321,10 @@ class Catalogue:
for book in book_list:
book = self.process_by_filetype(book)
with open(fsocket, 'w') as _socket:
_socket.write(book[0])
try:
_socket.write(book[0])
except TypeError:
continue
_socket.close()
db.insert_book(book)
inserted = db.commit()