From bb3aa701b2af491966f5f165cd6f5621669e4f34 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 26 Oct 2022 14:20:45 +0300 Subject: [PATCH 1/4] Add processing of float numbers in style --- src/style_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/style_reader.py b/src/style_reader.py index 8831f9a..d178e32 100644 --- a/src/style_reader.py +++ b/src/style_reader.py @@ -1,6 +1,6 @@ import re import cssutils -from typing import List, Tuple, Union +from typing import List, Tuple from os.path import dirname, normpath, join from src.util.color_reader import str2hex @@ -74,7 +74,7 @@ class StyleReader: def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str: size_number = float(size_number.replace(unit_to_replace, "")) * multiplier return str(size_number) + "px" - has_size = re.search(r"(\d+)([\w%]+)", size_value) + has_size = re.search(r"(\d+(?:\.\d+)?)([\w%]+)", size_value) values: List = size_value.split(" ") if has_size: size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0] From 9704fabff0fa1d0f0d6406179ce3012f72adaf7a Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 26 Oct 2022 14:21:45 +0300 Subject: [PATCH 2/4] Small changes in process_internal_links --- src/epub_converter/epub_converter.py | 109 ++++++++++++++------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/src/epub_converter/epub_converter.py b/src/epub_converter/epub_converter.py index 3cd55fb..206111f 100644 --- a/src/epub_converter/epub_converter.py +++ b/src/epub_converter/epub_converter.py @@ -339,7 +339,7 @@ class EpubConverter: normed_path = path.normpath(path.join( dir_name, href_in_link)).replace("\\", "/") full_path = [ - path for path in self.hrefs_added_to_toc if normed_path in path] + href_from_toc for href_from_toc in self.hrefs_added_to_toc if normed_path in href_from_toc] if not full_path: self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. " f"While processing href in {internal_link_tag}.") @@ -378,78 +378,79 @@ class EpubConverter: """ def make_ids_unique(): - for toc_href in self.hrefs_added_to_toc: - for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): + for href_from_toc in self.hrefs_added_to_toc: + for tag in self.html_href2html_body_soup[href_from_toc].find_all(attrs={"id": re.compile(r".+")}): if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]: - new_id = self.create_unique_id( - toc_href, tag.attrs["id"]) - tag.attrs["id"] = new_id + new_unique_id = self.create_unique_id( + href_from_toc, tag.attrs["id"]) + tag.attrs["id"] = new_unique_id def process_file_anchor(): - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - for internal_link_tag in soup.find_all("a", - {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}): - a_tag_href = internal_link_tag.attrs["href"] - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( - toc_href, a_tag_href, internal_link_tag) - if a_tag_href_matched_to_toc: - new_id = self.create_unique_id( - a_tag_href_matched_to_toc, "") - internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" - if new_id not in self.internal_anchors: - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - new_anchor_span = self.create_new_anchor_span( - soup, new_id) - # insert a new span to the beginning of the file - anchor_soup.insert(0, new_anchor_span) - self.internal_anchors.add(new_id) + def span_creation(): + if new_unique_id not in self.internal_anchors: + anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor] + new_anchor_span = self.create_new_anchor_span(html_content, new_unique_id) + # insert a new span to the beginning of the file + anchor_html_content.insert(0, new_anchor_span) + self.internal_anchors.add(new_unique_id) + + for html_href_from_toc in self.hrefs_added_to_toc: + html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc] + for internal_link_tag in html_content.find_all("a", + {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}): + href_ = internal_link_tag.attrs["href"] + html_href_of_anchor = self.match_href_to_path_from_toc( + html_href_from_toc, href_, internal_link_tag) + if html_href_of_anchor: + new_unique_id = self.create_unique_id(html_href_of_anchor, "") + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}" + span_creation() del internal_link_tag.attrs["href"] def process_file_element_anchor(): - for toc_href in self.hrefs_added_to_toc: - soup = self.html_href2html_body_soup[toc_href] - # process_file_element_anchor - for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}): - a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split( - "#") - a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( - toc_href, a_tag_href, internal_link_tag) if a_tag_href \ - else path.normpath(toc_href).replace("\\", "/") - if a_tag_href_matched_to_toc: - new_id = self.create_unique_id( - a_tag_href_matched_to_toc, a_tag_id) + def span_creation(): + if anchor_tag.attrs["id"] not in self.internal_anchors: + new_anchor_span = self.create_new_anchor_span( + html_content, new_unique_id) + anchor_tag.insert_before(new_anchor_span) + self.internal_anchors.add(new_unique_id) + del anchor_tag.attrs["id"] - anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] - anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \ - anchor_soup.find_all( - attrs={"id": a_tag_id}) # if link is a footnote + for html_href_from_toc in self.hrefs_added_to_toc: + html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc] + # process_file_element_anchor + for internal_link_tag in html_content.find_all("a", + {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}): + href_, id_ = internal_link_tag.attrs["href"].split("#") + html_href_of_anchor = self.match_href_to_path_from_toc( + html_href_from_toc, href_, internal_link_tag) if href_ \ + else path.normpath(html_href_from_toc).replace("\\", "/") # the same page + if html_href_of_anchor: + new_unique_id = self.create_unique_id(html_href_of_anchor, id_) + + anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor] + anchor_tags = anchor_html_content.find_all(attrs={"id": new_unique_id}) or \ + anchor_html_content.find_all(attrs={"id": id_}) # if link is a footnote if anchor_tags: if len(anchor_tags) > 1: - self.logger.log(f"Warning in {toc_href}: multiple anchors:" + self.logger.log(f"Warning in {html_href_from_toc}: multiple anchors:" f"{len(anchor_tags)} found.\n" f"{anchor_tags}\n" f"While processing {internal_link_tag}") - anchor_tag = anchor_tags[0] - assert anchor_tag.attrs["id"] in [new_id, a_tag_id] + assert anchor_tag.attrs["id"] in [new_unique_id, id_] # if anchor is found we could add placeholder for link creation on server side. - internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" + internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}" # create span to have cyclic links, link has 1 type of class, anchor another - if anchor_tag.attrs["id"] not in self.internal_anchors: - new_anchor_span = self.create_new_anchor_span( - soup, new_id) - anchor_tag.insert_before(new_anchor_span) - self.internal_anchors.add(new_id) - del anchor_tag.attrs["id"] + span_creation() del internal_link_tag.attrs["href"] else: internal_link_tag.attrs["converter-mark"] = "bad-link" - self.logger.log(f"Error in {toc_href}." + self.logger.log(f"Error in {html_href_from_toc}." f" While processing {internal_link_tag} no anchor found." - f" Should be anchor with new id={new_id} in" - f" {a_tag_href_matched_to_toc} file." - f" Old id={a_tag_id}") + f" Should be anchor with new id={new_unique_id} in" + f" {html_href_of_anchor} file." + f" Old id={id_}") # 1. make ids to be unique in all documents make_ids_unique() # 2a. process anchor which is a whole htm|html|xhtml file From b0e5c0334b1671c54d37658273dae64324a21fa9 Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 26 Oct 2022 16:22:10 +0300 Subject: [PATCH 3/4] Add time folder to logs(without time of the build) --- lc_converter.sh | 2 +- src/util/helpers.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/lc_converter.sh b/lc_converter.sh index 24430d5..9224b72 100644 --- a/lc_converter.sh +++ b/lc_converter.sh @@ -5,4 +5,4 @@ sudo docker stop lc_converter_container #remove container sudo docker rm -f lc_converter_container #start container -sudo docker run --name=lc_converter_container -v /var/log/lc-converter/$(date +%Y-%m-%d_%H-%M-%S):/app/logs lc_converter_image +sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image diff --git a/src/util/helpers.py b/src/util/helpers.py index ca95606..d5ce7f8 100644 --- a/src/util/helpers.py +++ b/src/util/helpers.py @@ -1,4 +1,5 @@ import os +import time import logging from typing import Union @@ -51,12 +52,17 @@ class BookLogger: self.main_logger = main_logger self.logger = logging.getLogger(name) self.logger.propagate = False + folder_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) - folder_path = os.path.dirname(folder_path) - filename = f"logs/{book_id}.log" + folder_path = os.path.join(os.path.dirname(folder_path), f"logs/{time.strftime('%d-%m-%Y_%H-00')}/") + filename = f"{book_id}.log" file_path = os.path.join(folder_path, filename) + + if not os.path.exists(folder_path): + os.makedirs(folder_path + time.strftime("%Y-%m-%_%H")) file_handler = logging.FileHandler(file_path, mode=filemode) + file_format = logging.Formatter(logging_format) file_handler.setFormatter(file_format) self.logger.addHandler(file_handler) From 4cc1333b80968361c24ab6ae8d18df2550a875dc Mon Sep 17 00:00:00 2001 From: Kiryl Date: Wed, 26 Oct 2022 16:22:32 +0300 Subject: [PATCH 4/4] Little changes in .xml --- ci_build/build.xml | 2 +- src/docx_converter/docx_solver.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ci_build/build.xml b/ci_build/build.xml index 783c94d..b894f00 100644 --- a/ci_build/build.xml +++ b/ci_build/build.xml @@ -1,5 +1,5 @@ - +