#!/usr/bin/python import json import os import re import zipfile from PIL import Image from bs4 import BeautifulSoup from config import Config from api_hooks import DuckDuckGo config = Config() class Catalogue: """Decodes and stores book information""" """Step One: filter_books""" def __init__(self): self.file_list = [] self.opf_regx = re.compile(r'\.opf') self.cover_regx = re.compile(r'\.jpg|\.jpeg|\.png|\.bmp|\.gif') self.html_regx = re.compile(r'\.html') """ with open(config.book_shelf, 'r') as f: try: self.catalogue = json.load(f) self.current_files = self.scan_folder() except Exception: self.filter_books() """ def scan_folder(self, folder=config.book_path): for f in os.listdir(folder): _path = os.path.abspath(folder+'/'+f) #_path = os.path.abspath('.')+'/'+folder+f+'/' _is_dir = os.path.isdir(_path.strip()+'/') if _is_dir: self.file_list.append(self.scan_folder(_path)) self.file_list.append(_path) def scan_book(self, book): """REMOVE ME?""" _epub = zipfile.ZipFile(book) with _epub as _epub_open: try: _epub_open.open('content.opf'); return True except Exception as e: print(e); return False def filter_books(self): """ Scan book folder recursively for epub files filter_books(0) -> Catalogue.books filter_books(1) -> self.books[] :param ret: 0 -> create class property -> dump json :param ret: 1 -> create & return class property """ self.scan_folder() regx = re.compile(r"\.epub") self.books = list(filter(regx.search, filter(None, self.file_list))) _book_list_expanded = {} with open(config.book_shelf, 'w') as f: for book in self.books: _book_list_expanded[book] = self.process_book(book) json.dump(_book_list_expanded, f) return _book_list_expanded def process_book(self, book): """Return dictionary of epub file contents""" f_name = 'content.opf' book = zipfile.ZipFile(book, 'r') details = {} with book as book_zip: details['files'] = [] details['path'] = book.filename expanded = book_zip.infolist() regx = re.compile(r'\.opf|cover') for i in expanded: match = re.search(regx, i.filename) if match: # Returns zip file location of requested files details['files'].append(match.string) return details def extract_metadata(self, book): """ Return extracted metadata and cover picture book['path'] == Full path to ebook file book['files'] == list of files from self.process_book(book) """ book_zip = zipfile.ZipFile(book['path'], 'r') with book_zip as f: content = self.extract_content(book_zip, book) soup = BeautifulSoup(content, "lxml") title = soup.find("dc:title") if title == None: title = book['path'].split('/')[-1].rsplit('.', 1)[0] else: title = title.contents[0] author = soup.find("dc:creator") if author != None: author = author.contents[0] try: cover = self.extract_cover_image(book_zip, book) except IndexError: # cover = self.extract_cover_html(book_zip, book) cover = DuckDuckGo().image_result(title) book_details = [title, author, cover, book['path']] return book_details def extract_content(self, book_zip, book): content = book_zip.open( list( filter(self.opf_regx.search, book['files']) )[0] ) return content def extract_cover_html(self, book_zip, book): cover = book_zip.open( list( filter(self.html_regx.search, book['files']) )[0] ) return cover def extract_cover_image(self, book_zip, book): # TODO Handle books that have no Cover Image # TODO Handle books with html covers cover = book_zip.open( list( filter(self.cover_regx.search, book['files']) )[0] ) try: cover = book_zip.read(cover.name); return cover except KeyError: return False def compare_shelf_current(self): try: self.books except Exception: self.filter_books() unique = set(self.books) - set(self.catalogue) return unique