From 13fc85f6ffa79b2bffad7aba0fe6b7439174b3e8 Mon Sep 17 00:00:00 2001 From: Raelon Masters Date: Sat, 20 Jun 2020 23:28:20 -0400 Subject: [PATCH] Added sanitization to book title collection. Stripping patterns matching Book 0-9*+ - --- src/backend/lib/library.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/backend/lib/library.py b/src/backend/lib/library.py index 2013e1b..737d09e 100755 --- a/src/backend/lib/library.py +++ b/src/backend/lib/library.py @@ -26,6 +26,8 @@ class Catalogue: self.opf_regx = re.compile(r"\.opf") self.cover_regx = re.compile(r"\.jpg|\.jpeg|\.png|\.bmp|\.gif") self.html_regx = re.compile(r"\.html") + self.title_sanitization_regx = re.compile(r"^(Book )+[0-9]*") + self.title_sanitization_lvl2_regx = re.compile(r"^(Book )+[0-9]*\W+(-)") self.root_dir = config.root self.book_folder = config.book_path self.books = None @@ -69,6 +71,7 @@ class Catalogue: """ def process_by_filetype(self, book): + print(str(book), end='\r', flush=True) if book.endswith(".epub"): epub = self.process_epub(book) return self.extract_metadata_epub(epub) @@ -107,6 +110,12 @@ class Catalogue: title = book["path"].split("/")[-1].rsplit(".", 1)[0] else: title = title.contents[0] + if re.match(self.title_sanitization_regx, title): + breakpoint() + if re.match(self.title_sanitization_lvl2_regx, title): + title = re.split(r"-+\W", title)[1] + else: title = re.split(self.title_sanitization_regx, title)[2] + author = soup.find("dc:creator") if author is not None: author = author.contents[0]