mirror of
https://github.com/th3r00t/pyShelf.git
synced 2026-04-28 01:59:35 -04:00
Began extracting data from epub files
This commit is contained in:
96
lib/library.py
Executable file
96
lib/library.py
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/python
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from bs4 import BeautifulSoup
|
||||
from config import Config
|
||||
# Module-level configuration instance; Catalogue reads `book_shelf` and
# `book_path` from it.
config = Config()
|
||||
|
||||
|
||||
class Catalogue:
    """Decode and store book information.

    Step one of building a catalogue is :meth:`filter_books`, which scans the
    configured book folder for epub files and writes the shelf file.

    Instance attributes:
        file_list: paths of every file collected by :meth:`scan_folder`.
        books: epub paths found by the most recent :meth:`filter_books` call.
        catalogue: mapping loaded from the shelf file, or the freshly built
            mapping when the shelf could not be read.
        current_files: file listing captured while constructing the instance.
    """

    def __init__(self):
        """Load the shelf file, rebuilding it when missing or unreadable."""
        self.file_list = []
        try:
            with open(config.book_shelf, 'r') as shelf:
                self.catalogue = json.load(shelf)
        except (OSError, ValueError):
            # Missing, unreadable or malformed shelf (JSON decode errors are
            # ValueError subclasses): rebuild it from the book folder.  The
            # read handle is already closed here, so filter_books() can
            # safely reopen the file for writing.
            self.catalogue = self.filter_books()
            self.current_files = self.file_list
        else:
            self.current_files = self.scan_folder()

    def scan_folder(self, folder=None):
        """Recursively collect absolute file paths below ``folder``.

        Paths are appended to ``self.file_list``; directories are descended
        into rather than listed themselves.

        :param folder: directory to scan; defaults to ``config.book_path``,
            looked up at call time.
        :returns: ``self.file_list`` (the accumulated listing).
        :raises OSError: if a directory cannot be listed.
        """
        if folder is None:
            folder = config.book_path
        for entry in os.listdir(folder):
            path = os.path.abspath(os.path.join(folder, entry))
            if os.path.isdir(path):
                self.scan_folder(path)
            else:
                self.file_list.append(path)
        return self.file_list

    def scan_book(self, book):
        """Report whether ``book`` is a zip archive with a root content.opf.

        The original author flagged this helper for possible removal.

        :param book: path to the candidate epub file.
        :returns: True when ``content.opf`` exists at the archive root;
            False (after printing the error) when the entry is missing or
            the file cannot be opened as a zip archive.
        """
        try:
            with zipfile.ZipFile(book) as epub:
                # getinfo() raises KeyError for a missing member without
                # opening a member handle that would then need closing.
                epub.getinfo('content.opf')
        except (KeyError, OSError, zipfile.BadZipFile) as err:
            print(err)
            return False
        return True

    def filter_books(self):
        """Scan the book folder for epub files and write the shelf file.

        Sets ``self.books`` to the epub paths found, then dumps a mapping of
        ``path -> process_book(path)`` as JSON to ``config.book_shelf``.
        Archives that cannot be read are reported and left out of the
        mapping instead of aborting the whole scan.

        :returns: the mapping that was written to the shelf file.
        """
        # Start from a clean listing so repeated scans do not pile up
        # duplicate paths.
        self.file_list = []
        self.scan_folder()
        # Match on the real extension only; a bare ".epub" substring would
        # also select e.g. "book.epub.part".
        self.books = [
            path for path in self.file_list
            if path.lower().endswith('.epub')
        ]
        expanded = {}
        for book in self.books:
            try:
                expanded[book] = self.process_book(book)
            except (OSError, zipfile.BadZipFile) as err:
                print(err)
        # Only touch the shelf once every book has been processed, so a
        # failure above can never leave a truncated file behind.
        with open(config.book_shelf, 'w') as shelf:
            json.dump(expanded, shelf)
        return expanded

    def process_book(self, book):
        """Return a dictionary describing the interesting epub members.

        :param book: path to the epub (zip) file.
        :returns: ``{'files': [...], 'path': ...}`` where ``files`` lists the
            archive member names containing ``.opf`` or ``cover`` and
            ``path`` is the archive's file name.
        :raises zipfile.BadZipFile: if ``book`` is not a valid zip archive.
        """
        wanted = re.compile(r'\.opf|cover')
        with zipfile.ZipFile(book, 'r') as book_zip:
            return {
                'files': [
                    name for name in book_zip.namelist()
                    if wanted.search(name)
                ],
                'path': book_zip.filename,
            }

    def extract_metadata(self, book):
        """Return the ``dc:title`` element of the book's OPF document.

        :param book: dictionary as produced by :meth:`process_book`;
            ``book['path']`` is the path to the epub file and
            ``book['files']`` the list of member names found in it.
        :returns: the result of ``soup.find("dc:title")``, or None when
            ``book['files']`` lists no ``.opf`` member.
        """
        opf_files = [name for name in book['files'] if '.opf' in name]
        if not opf_files:
            return None
        with zipfile.ZipFile(book['path'], 'r') as book_zip:
            with book_zip.open(opf_files[0]) as opf:
                soup = BeautifulSoup(opf.read(), "xml")
        # TODO: author ("dc:creator") and cover (<meta name="cover">) are not
        # extracted yet; only the title is returned.
        return soup.find("dc:title")

    def compare_shelf_current(self):
        """Return the set of book paths that are not in the catalogue.

        Runs :meth:`filter_books` first when no scan has populated
        ``self.books`` yet.
        """
        if not hasattr(self, 'books'):
            self.filter_books()
        return set(self.books) - set(self.catalogue)
|
||||
Reference in New Issue
Block a user