First push with mobi support

This commit is contained in:
Raelon Masters
2020-05-31 01:20:52 -04:00
parent ac1a166ae5
commit 7ee9c57ed1
6 changed files with 43 additions and 21 deletions

View File

@@ -7,6 +7,8 @@ import zipfile
from bs4 import BeautifulSoup
from mobi import Mobi
from .api_hooks import DuckDuckGo
from .config import Config
from .storage import Storage
@@ -26,8 +28,7 @@ class Catalogue:
self.html_regx = re.compile(r"\.html")
self.root_dir = config.root
self.book_folder = config.book_path
self.book_shelf = config.book_shelf
self._book_list_expanded = None
# self.book_shelf = config.book_shelf
self.books = None
self.db_pointer = config.catalogue_db
self.config = config
@@ -56,24 +57,30 @@ class Catalogue:
:returns self._book_list_expanded: json string containing all book metadata
"""
self.scan_folder() # Populate file list
regx = re.compile(r"\.epub")
self.scan_folder() # Populate file list
regx = re.compile(r"\.epub|\.mobi")
try:
self.books = list(filter(regx.search, filter(None, self.file_list)))
except TypeError as e:
print(e)
self._book_list_expanded = {}
with open(self.book_shelf, "w") as f:
for book in self.books:
self._book_list_expanded[book] = self.process_book(book)
json.dump(self._book_list_expanded, f)
"""
for book in self.books:
self._book_list_expanded[book] = self.process_by_filetype(book)
return self._book_list_expanded
"""
def process_by_filetype(self, book):
if book.endswith(".epub"):
epub = self.process_epub(book)
return self.extract_metadata_epub(epub)
elif book.endswith(".mobi"):
return self.extract_metadata_mobi(book)
@staticmethod
def process_book(book):
def process_epub(book):
"""Return dictionary of epub file contents"""
book = zipfile.ZipFile(book, "r")
details = {}
book = zipfile.ZipFile(book, "r")
with book as book_zip:
details["files"] = []
details["path"] = book.filename
@@ -86,7 +93,7 @@ class Catalogue:
details["files"].append(match.string)
return details
def extract_metadata(self, book):
def extract_metadata_epub(self, book):
"""
Return extracted metadata and cover picture
book['path'] == Full path to ebook file
@@ -94,7 +101,7 @@ class Catalogue:
"""
book_zip = zipfile.ZipFile(book["path"], "r")
with book_zip as f:
content = self.extract_content(book_zip, book)
content = self.extract_content(f, book)
soup = BeautifulSoup(content, "lxml")
title = soup.find("dc:title")
if title is None:
@@ -105,13 +112,27 @@ class Catalogue:
if author is not None:
author = author.contents[0]
try:
cover = self.extract_cover_image(book_zip, book)
cover = self.extract_cover_image(f, book)
except IndexError:
# cover = self.extract_cover_html(book_zip, book)
cover = DuckDuckGo().image_result(title)
book_details = [title, author, cover, book["path"]]
return book_details
@staticmethod
def extract_metadata_mobi(book):
book = Mobi(book)
book.parse()
try:
cover_image = book.readImageRecord(0)
except KeyError:
cover_image = None
title = book.title().decode("utf-8")
author = book.author().decode(
"utf-8"
) # TODO some files are still passing encoded data for author.
return [title, author, cover_image, book.f.name]
def extract_content(self, book_zip, book):
"""
Opens epub as zip file filters then stores as list any files matching opf_regx
@@ -161,12 +182,12 @@ class Catalogue:
Gets a list of new files via compare_shelf_current.
Iterates over list and inserts new books into database.
"""
# TODO Refactor metadata extraction into process_book call to more easily handle additional formats
book_list = self.compare_shelf_current()
db = Storage(self.config)
for book in book_list:
book = self.process_book(book)
extracted = self.extract_metadata(book)
db.insert_book(extracted)
book = self.process_by_filetype(book)
db.insert_book(book)
inserted = db.commit()
if inserted is not True:
print(inserted)