mirror of
https://github.com/th3r00t/pyShelf.git
synced 2026-04-28 01:59:35 -04:00
PDF Support, Description, & image acquisition needs work.
This commit is contained in:
@@ -26,3 +26,4 @@ ptvsd
|
|||||||
pudb
|
pudb
|
||||||
daphne
|
daphne
|
||||||
whitenoise
|
whitenoise
|
||||||
|
pypdf2
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
import sys
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
import json
|
||||||
# sys.path.insert(1, 'lib/')
|
# sys.path.insert(1, 'lib/')
|
||||||
|
|
||||||
|
|
||||||
@@ -33,3 +31,22 @@ class DuckDuckGo:
|
|||||||
return image.raw
|
return image.raw
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def description_result(self, query):
    """Return the abstract text the search endpoint reports for ``query``.

    :param query: search term. Either a plain string or an object exposing
        a ``.string`` attribute, in which case that attribute is used.
    :returns: the ``Abstract`` field of the decoded JSON response, or
        ``None`` when the request fails, the body is not a JSON object,
        the ``Results`` list is missing or empty, or no ``Abstract`` field
        is present.
    """
    from urllib.parse import quote_plus

    # Accept tag-like objects as well as plain strings.
    query = getattr(query, "string", query)
    if query is None:
        return None

    try:
        # Encode the term so characters such as "&" or "#" in a title
        # cannot truncate or corrupt the query string.
        endpoint = self.url + quote_plus(str(query)) + "&format=json"
        # A timeout keeps one unresponsive lookup from stalling the caller.
        payload = json.loads(requests.get(endpoint, timeout=10).text)
    except (requests.RequestException, TypeError, ValueError):
        # Best effort: network failures and undecodable bodies simply mean
        # "no description available".
        return None

    # NOTE(review): an empty "Results" list suppresses the abstract, as the
    # original code did; confirm this gate is intended, since a response
    # may carry an abstract without any result links.
    if not isinstance(payload, dict) or not payload.get("Results"):
        return None
    return payload.get("Abstract")
|
||||||
|
|||||||
@@ -1,15 +1,13 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import pathlib
|
|
||||||
import re
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
|
import PyPDF2
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from mobi import Mobi
|
from mobi import Mobi
|
||||||
|
|
||||||
from .api_hooks import DuckDuckGo
|
from .api_hooks import DuckDuckGo
|
||||||
from .config import Config
|
|
||||||
from .storage import Storage
|
from .storage import Storage
|
||||||
|
|
||||||
|
|
||||||
@@ -57,16 +55,11 @@ class Catalogue:
|
|||||||
:returns self._book_list_expanded: json string containing all book metadata
|
:returns self._book_list_expanded: json string containing all book metadata
|
||||||
"""
|
"""
|
||||||
self.scan_folder() # Populate file list
|
self.scan_folder() # Populate file list
|
||||||
regx = re.compile(r"\.epub|\.mobi")
|
regx = re.compile(r"\.epub|\.mobi|\.pdf")
|
||||||
try:
|
try:
|
||||||
self.books = list(filter(regx.search, filter(None, self.file_list)))
|
self.books = list(filter(regx.search, filter(None, self.file_list)))
|
||||||
except TypeError as e:
|
except TypeError as e:
|
||||||
self.config.logger.error(e)
|
self.config.logger.error(e)
|
||||||
"""
|
|
||||||
for book in self.books:
|
|
||||||
self._book_list_expanded[book] = self.process_by_filetype(book)
|
|
||||||
return self._book_list_expanded
|
|
||||||
"""
|
|
||||||
|
|
||||||
def process_by_filetype(self, book):
|
def process_by_filetype(self, book):
|
||||||
if book.endswith(".epub"):
|
if book.endswith(".epub"):
|
||||||
@@ -74,6 +67,8 @@ class Catalogue:
|
|||||||
return self.extract_metadata_epub(epub)
|
return self.extract_metadata_epub(epub)
|
||||||
elif book.endswith(".mobi"):
|
elif book.endswith(".mobi"):
|
||||||
return self.extract_metadata_mobi(book)
|
return self.extract_metadata_mobi(book)
|
||||||
|
elif book.endswith(".pdf"):
|
||||||
|
return self.extract_metadata_pdf(book)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def process_epub(book):
|
def process_epub(book):
|
||||||
@@ -165,6 +160,56 @@ class Catalogue:
|
|||||||
]
|
]
|
||||||
return book_details
|
return book_details
|
||||||
|
|
||||||
|
def extract_metadata_pdf(self, book):
    """Return the metadata record for a PDF file.

    :NOTES: Retrieval of data has been problematic: some PDFs provide
        reliable titles that correspond with the actual title, while
        others contain nonsense. The title is therefore derived from the
        file name rather than from the embedded document information.

    :param book: path of the PDF file.
    :returns: ``None`` when the file cannot be read as a PDF, otherwise a
        list ordered as ``[title, author, cover_image, fname, description,
        identifier, publisher, date, rights, ftags]``. Fields that are not
        extracted for PDFs are ``None``.
    """
    try:
        # Some PDFs raise odd errors on access (e.g. reporting themselves
        # as encrypted when they are not), so any failure while opening or
        # reading the document information means "skip this file".
        pdf = PyPDF2.PdfFileReader(book)
        info = pdf.getDocumentInfo()
        if info is None:
            # No document information: treat as not being a usable PDF.
            return None
        author = info.author
    except Exception:
        return None

    fname = str(book)
    # Derive the title from the file name: drop the directory part and the
    # extension, then turn underscores into spaces.
    title = os.path.splitext(os.path.basename(fname))[0].replace("_", " ")

    # Only build the search client once the PDF is known to be readable.
    ddg = DuckDuckGo()
    try:
        cover_image = ddg.image_result(title)
    except Exception:
        # The cover lookup is best effort; a failure must not lose the book.
        cover_image = None
    description = ddg.description_result(title)

    # The remaining fields are not extracted for PDFs.
    identifier = None
    publisher = None
    date = None
    rights = None
    ftags = None
    return [
        title,
        author,
        cover_image,
        fname,
        description,
        identifier,
        publisher,
        date,
        rights,
        ftags,
    ]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def stripTags(source):
|
def stripTags(source):
|
||||||
p = re.compile(r"<.*?>")
|
p = re.compile(r"<.*?>")
|
||||||
@@ -276,7 +321,10 @@ class Catalogue:
|
|||||||
for book in book_list:
|
for book in book_list:
|
||||||
book = self.process_by_filetype(book)
|
book = self.process_by_filetype(book)
|
||||||
with open(fsocket, 'w') as _socket:
|
with open(fsocket, 'w') as _socket:
|
||||||
_socket.write(book[0])
|
try:
|
||||||
|
_socket.write(book[0])
|
||||||
|
except TypeError:
|
||||||
|
continue
|
||||||
_socket.close()
|
_socket.close()
|
||||||
db.insert_book(book)
|
db.insert_book(book)
|
||||||
inserted = db.commit()
|
inserted = db.commit()
|
||||||
|
|||||||
Reference in New Issue
Block a user