mirror of
https://github.com/th3r00t/pyShelf.git
synced 2026-04-28 01:59:35 -04:00
Added sanitization to book title collection. Stripping patterns matching Book 0-9*+ -
This commit is contained in:
@@ -26,6 +26,8 @@ class Catalogue:
|
|||||||
self.opf_regx = re.compile(r"\.opf")
|
self.opf_regx = re.compile(r"\.opf")
|
||||||
self.cover_regx = re.compile(r"\.jpg|\.jpeg|\.png|\.bmp|\.gif")
|
self.cover_regx = re.compile(r"\.jpg|\.jpeg|\.png|\.bmp|\.gif")
|
||||||
self.html_regx = re.compile(r"\.html")
|
self.html_regx = re.compile(r"\.html")
|
||||||
|
self.title_sanitization_regx = re.compile(r"^(Book )+[0-9]*")
|
||||||
|
self.title_sanitization_lvl2_regx = re.compile(r"^(Book )+[0-9]*\W+(-)")
|
||||||
self.root_dir = config.root
|
self.root_dir = config.root
|
||||||
self.book_folder = config.book_path
|
self.book_folder = config.book_path
|
||||||
self.books = None
|
self.books = None
|
||||||
@@ -69,6 +71,7 @@ class Catalogue:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def process_by_filetype(self, book):
|
def process_by_filetype(self, book):
|
||||||
|
print(str(book), end='\r', flush=True)
|
||||||
if book.endswith(".epub"):
|
if book.endswith(".epub"):
|
||||||
epub = self.process_epub(book)
|
epub = self.process_epub(book)
|
||||||
return self.extract_metadata_epub(epub)
|
return self.extract_metadata_epub(epub)
|
||||||
@@ -107,6 +110,12 @@ class Catalogue:
|
|||||||
title = book["path"].split("/")[-1].rsplit(".", 1)[0]
|
title = book["path"].split("/")[-1].rsplit(".", 1)[0]
|
||||||
else:
|
else:
|
||||||
title = title.contents[0]
|
title = title.contents[0]
|
||||||
|
if re.match(self.title_sanitization_regx, title):
|
||||||
|
breakpoint()
|
||||||
|
if re.match(self.title_sanitization_lvl2_regx, title):
|
||||||
|
title = re.split(r"-+\W", title)[1]
|
||||||
|
else: title = re.split(self.title_sanitization_regx, title)[2]
|
||||||
|
|
||||||
author = soup.find("dc:creator")
|
author = soup.find("dc:creator")
|
||||||
if author is not None:
|
if author is not None:
|
||||||
author = author.contents[0]
|
author = author.contents[0]
|
||||||
|
|||||||
Reference in New Issue
Block a user