Merge pull request #311 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
2022-10-27 10:00:43 +03:00
parent 72902a5824 4cc1333b80
commit 60c73d91aa
6 changed files with 68 additions and 62 deletions
--- a/ci_build/build.xml
+++ b/ci_build/build.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project name="LawCarta converter project" default="full-build">
+<project name="LiveCarta converter project" default="full-build">
 <target name="full-build"
         depends="prepare"
--- a/lc_converter.sh
+++ b/lc_converter.sh
@@ -5,4 +5,4 @@ sudo docker stop  lc_converter_container
 #remove container
 sudo docker rm -f  lc_converter_container
 #start container
-sudo docker run --name=lc_converter_container -v /var/log/lc-converter/$(date +%Y-%m-%d_%H-%M-%S):/app/logs lc_converter_image
+sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image
--- a/src/docx_converter/docx_solver.py
+++ b/src/docx_converter/docx_solver.py
@@ -80,8 +80,7 @@ class DocxBook(BookSolver):
 if __name__ == "__main__":
-
+    docx_file_path = f"../../books/docx/output.docx"
    docx_file_path = "../../books/docx/output.docx"
    logger_object = BookLogger(
        name="docx", book_id=docx_file_path.split("/")[-1])
    locker = Event()
--- a/src/epub_converter/epub_converter.py
+++ b/src/epub_converter/epub_converter.py
@@ -339,7 +339,7 @@ class EpubConverter:
        normed_path = path.normpath(path.join(
            dir_name, href_in_link)).replace("\\", "/")
        full_path = [
-            path for path in self.hrefs_added_to_toc if normed_path in path]
+            href_from_toc for href_from_toc in self.hrefs_added_to_toc if normed_path in href_from_toc]
        if not full_path:
            self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
                            f"While processing href in {internal_link_tag}.")
@@ -378,78 +378,79 @@ class EpubConverter:
        """
        def make_ids_unique():
-            for toc_href in self.hrefs_added_to_toc:
+            for href_from_toc in self.hrefs_added_to_toc:
-                for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
+                for tag in self.html_href2html_body_soup[href_from_toc].find_all(attrs={"id": re.compile(r".+")}):
                    if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
-                        new_id = self.create_unique_id(
+                        new_unique_id = self.create_unique_id(
-                            toc_href, tag.attrs["id"])
+                            href_from_toc, tag.attrs["id"])
-                        tag.attrs["id"] = new_id
+                        tag.attrs["id"] = new_unique_id
        def process_file_anchor():
-            for toc_href in self.hrefs_added_to_toc:
+            def span_creation():
-                soup = self.html_href2html_body_soup[toc_href]
+                if new_unique_id not in self.internal_anchors:
-                for internal_link_tag in soup.find_all("a",
+                    anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
-                                                       {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
+                    new_anchor_span = self.create_new_anchor_span(html_content, new_unique_id)
                    a_tag_href = internal_link_tag.attrs["href"]
                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
                        toc_href, a_tag_href, internal_link_tag)
                    if a_tag_href_matched_to_toc:
                        new_id = self.create_unique_id(
                            a_tag_href_matched_to_toc, "")
                        internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
                        if new_id not in self.internal_anchors:
                            anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
                            new_anchor_span = self.create_new_anchor_span(
                                soup, new_id)
                    # insert a new span to the beginning of the file
-                            anchor_soup.insert(0, new_anchor_span)
+                    anchor_html_content.insert(0, new_anchor_span)
-                            self.internal_anchors.add(new_id)
+                    self.internal_anchors.add(new_unique_id)
            for html_href_from_toc in self.hrefs_added_to_toc:
                html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
                for internal_link_tag in html_content.find_all("a",
                                                               {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
                    href_ = internal_link_tag.attrs["href"]
                    html_href_of_anchor = self.match_href_to_path_from_toc(
                                                html_href_from_toc, href_, internal_link_tag)
                    if html_href_of_anchor:
                        new_unique_id = self.create_unique_id(html_href_of_anchor, "")
                        internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
                        span_creation()
                        del internal_link_tag.attrs["href"]
        def process_file_element_anchor():
-            for toc_href in self.hrefs_added_to_toc:
+            def span_creation():
-                soup = self.html_href2html_body_soup[toc_href]
+                if anchor_tag.attrs["id"] not in self.internal_anchors:
-                # process_file_element_anchor
+                    new_anchor_span = self.create_new_anchor_span(
-                for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
+                        html_content, new_unique_id)
-                    a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
+                    anchor_tag.insert_before(new_anchor_span)
-                        "#")
+                    self.internal_anchors.add(new_unique_id)
-                    a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
+                    del anchor_tag.attrs["id"]
                        toc_href, a_tag_href, internal_link_tag) if a_tag_href \
                        else path.normpath(toc_href).replace("\\", "/")
                    if a_tag_href_matched_to_toc:
                        new_id = self.create_unique_id(
                            a_tag_href_matched_to_toc, a_tag_id)
-                        anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
+            for html_href_from_toc in self.hrefs_added_to_toc:
-                        anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
+                html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
-                            anchor_soup.find_all(
+                # process_file_element_anchor
-                                attrs={"id": a_tag_id})  # if link is a footnote
+                for internal_link_tag in html_content.find_all("a",
                                                               {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
                    href_, id_ = internal_link_tag.attrs["href"].split("#")
                    html_href_of_anchor = self.match_href_to_path_from_toc(
                        html_href_from_toc, href_, internal_link_tag) if href_ \
                        else path.normpath(html_href_from_toc).replace("\\", "/")  # the same page
                    if html_href_of_anchor:
                        new_unique_id = self.create_unique_id(html_href_of_anchor, id_)
                        anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
                        anchor_tags = anchor_html_content.find_all(attrs={"id": new_unique_id}) or \
                            anchor_html_content.find_all(attrs={"id": id_})  # if link is a footnote
                        if anchor_tags:
                            if len(anchor_tags) > 1:
-                                self.logger.log(f"Warning in {toc_href}: multiple anchors:"
+                                self.logger.log(f"Warning in {html_href_from_toc}: multiple anchors:"
                                                f"{len(anchor_tags)} found.\n"
                                                f"{anchor_tags}\n"
                                                f"While processing {internal_link_tag}")
                            anchor_tag = anchor_tags[0]
-                            assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
+                            assert anchor_tag.attrs["id"] in [new_unique_id, id_]
                            # if anchor is found we could add placeholder for link creation on server side.
-                            internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
+                            internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
                            # create span to have cyclic links, link has 1 type of class, anchor another
-                            if anchor_tag.attrs["id"] not in self.internal_anchors:
+                            span_creation()
                                new_anchor_span = self.create_new_anchor_span(
                                    soup, new_id)
                                anchor_tag.insert_before(new_anchor_span)
                                self.internal_anchors.add(new_id)
                                del anchor_tag.attrs["id"]
                            del internal_link_tag.attrs["href"]
                        else:
                            internal_link_tag.attrs["converter-mark"] = "bad-link"
-                            self.logger.log(f"Error in {toc_href}."
+                            self.logger.log(f"Error in {html_href_from_toc}."
                                            f" While processing {internal_link_tag} no anchor found."
-                                            f" Should be anchor with new id={new_id} in"
+                                            f" Should be anchor with new id={new_unique_id} in"
-                                            f" {a_tag_href_matched_to_toc} file."
+                                            f" {html_href_of_anchor} file."
-                                            f" Old id={a_tag_id}")
+                                            f" Old id={id_}")
        # 1. make ids to be unique in all documents
        make_ids_unique()
        # 2a. process anchor which is a whole htm|html|xhtml file
--- a/src/style_reader.py
+++ b/src/style_reader.py
@@ -1,6 +1,6 @@
 import re
 import cssutils
-from typing import List, Tuple, Union
+from typing import List, Tuple
 from os.path import dirname, normpath, join
 from src.util.color_reader import str2hex
@@ -74,7 +74,7 @@ class StyleReader:
        def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
            size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
            return str(size_number) + "px"
-        has_size = re.search(r"(\d+)([\w%]+)", size_value)
+        has_size = re.search(r"(\d+(?:\.\d+)?)([\w%]+)", size_value)
        values: List = size_value.split(" ")
        if has_size:
            size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
--- a/src/util/helpers.py
+++ b/src/util/helpers.py
@@ -1,4 +1,5 @@
 import os
 import time
 import logging
 from typing import Union
@@ -51,12 +52,17 @@ class BookLogger:
        self.main_logger = main_logger
        self.logger = logging.getLogger(name)
        self.logger.propagate = False
        folder_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
-        folder_path = os.path.dirname(folder_path)
+        folder_path = os.path.join(os.path.dirname(folder_path), f"logs/{time.strftime('%d-%m-%Y_%H-00')}/")
-        filename = f"logs/{book_id}.log"
+        filename = f"{book_id}.log"
        file_path = os.path.join(folder_path, filename)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path + time.strftime("%Y-%m-%_%H"))
        file_handler = logging.FileHandler(file_path, mode=filemode)
        file_format = logging.Formatter(logging_format)
        file_handler.setFormatter(file_format)
        self.logger.addHandler(file_handler)