Began extracting data from epub files

2026-04-28 01:59:35 -04:00 · 2019-10-01 17:30:36 -04:00
parent 81fe90bce5
commit 8e90fc851c
18 changed files with 210 additions and 150 deletions
--- a/lib/library.py
+++ b/lib/library.py
@@ -0,0 +1,96 @@
+#!/usr/bin/python
+import json
+import os
+import re
+import zipfile
+from bs4 import BeautifulSoup
+from config import Config
+# Module-level configuration instance; Catalogue reads config.book_shelf
+# and config.book_path from it.
+config = Config()
+
+
+class Catalogue:
+    """Index the epub files found under the configured book folder.
+
+    The index is cached as JSON in ``config.book_shelf`` and is rebuilt from
+    ``config.book_path`` whenever that cache is missing or unreadable.
+
+    Attributes:
+        file_list: absolute paths of every entry recorded by scan_folder().
+        current_files: folder contents as seen when the instance was built.
+        catalogue: mapping of epub path -> details dict (see process_book()).
+        books: epub paths found by the last filter_books() call; only set
+            once filter_books() has run.
+    """
+
+    def __init__(self):
+        """Load the cached shelf, rebuilding it when it cannot be read."""
+        self.file_list = []
+        try:
+            with open(config.book_shelf, 'r') as f:
+                self.catalogue = json.load(f)
+        except (OSError, ValueError):
+            # No shelf yet (first run) or invalid JSON: rebuild from disk.
+            # The read handle is already closed at this point, so
+            # filter_books() can reopen the same file for writing.
+            self.catalogue = self.filter_books()
+            self.current_files = list(self.file_list)
+        else:
+            self.current_files = self.scan_folder()
+
+    def scan_folder(self, folder=None):
+        """Recursively list everything below *folder*.
+
+        Paths that are not yet in self.file_list are appended to it, so
+        calling this repeatedly does not create duplicates.
+
+        :param folder: directory to scan; defaults to config.book_path,
+            looked up at call time
+        :return: list of absolute paths of all files and directories found;
+            each directory is listed after its own contents
+        """
+        if folder is None:
+            folder = config.book_path
+        found = self._walk(folder)
+        known = set(self.file_list)
+        for path in found:
+            if path not in known:
+                known.add(path)
+                self.file_list.append(path)
+        return found
+
+    def _walk(self, folder):
+        """Return absolute paths below *folder*, contents before their directory."""
+        paths = []
+        for name in os.listdir(folder):
+            path = os.path.abspath(os.path.join(folder, name))
+            if os.path.isdir(path):
+                paths.extend(self._walk(path))
+            paths.append(path)
+        return paths
+
+    def scan_book(self, book):
+        """Return True if *book* is a zip archive with a root-level content.opf.
+
+        Unreadable files, files that are not zip archives and archives
+        without that member print the error and return False.
+
+        :param book: path to the candidate epub file
+        """
+        try:
+            with zipfile.ZipFile(book) as archive:
+                # getinfo() checks for the member without opening a stream
+                # that would then have to be closed.
+                archive.getinfo('content.opf')
+        except (OSError, zipfile.BadZipFile, KeyError) as e:
+            print(e)
+            return False
+        return True
+
+    def filter_books(self):
+        """Rescan the book folder and rewrite the shelf file.
+
+        Sets self.books to the epub files found and dumps their details to
+        config.book_shelf as JSON.
+
+        :return: dict mapping each epub path to its process_book() details
+        """
+        self.scan_folder()
+        # Suffix test instead of a substring search so "x.epub.bak" is
+        # skipped; isfile() skips directories whose name ends in ".epub".
+        self.books = [
+            path for path in self.file_list
+            if path and path.lower().endswith('.epub') and os.path.isfile(path)
+        ]
+        # Read every book before touching the shelf, so a failure while
+        # reading does not leave a truncated shelf file behind.
+        shelf = {book: self.process_book(book) for book in self.books}
+        with open(config.book_shelf, 'w') as f:
+            json.dump(shelf, f)
+        return shelf
+
+    def process_book(self, book):
+        """Return the archive members of interest for one epub file.
+
+        :param book: path to the epub (zip) file
+        :return: dict with 'files' (archive member names containing ".opf"
+            or "cover") and 'path' (the path the archive was opened from)
+        """
+        wanted = re.compile(r'\.opf|cover')
+        with zipfile.ZipFile(book, 'r') as archive:
+            return {
+                'files': [n for n in archive.namelist() if wanted.search(n)],
+                'path': archive.filename,
+            }
+
+    def extract_metadata(self, book):
+        """Return the "dc:title" element from the book's OPF package file.
+
+        Only the title is returned for now; the author ("dc:creator") and
+        the cover reference (<meta name="cover">) are not extracted yet.
+
+        :param book: details dict as produced by process_book();
+            book['path'] is the epub path, book['files'] its member list
+        :return: the element BeautifulSoup finds for "dc:title", or None
+            when the book lists no .opf member or the OPF has no title
+        """
+        opf_regx = re.compile(r'\.opf')
+        opf_files = [name for name in book['files'] if opf_regx.search(name)]
+        if not opf_files:
+            return None
+        with zipfile.ZipFile(book['path'], 'r') as archive:
+            with archive.open(opf_files[0]) as content:
+                soup = BeautifulSoup(content, "xml")
+        return soup.find("dc:title")
+
+    def compare_shelf_current(self):
+        """Return the set of epub paths on disk missing from self.catalogue.
+
+        If filter_books() has not run yet it is run first, which also
+        rewrites the shelf file.
+        """
+        if not hasattr(self, 'books'):
+            self.filter_books()
+        return set(self.books) - set(self.catalogue)