Merge pull request #311 from Teqniksoft/kiryl/converter_fix

Kiryl/converter fix
This commit is contained in:
bivis
2022-10-27 10:00:43 +03:00
committed by GitHub
6 changed files with 68 additions and 62 deletions

View File

@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project name="LawCarta converter project" default="full-build"> <project name="LiveCarta converter project" default="full-build">
<target name="full-build" <target name="full-build"
depends="prepare" depends="prepare"

View File

@@ -5,4 +5,4 @@ sudo docker stop lc_converter_container
#remove container #remove container
sudo docker rm -f lc_converter_container sudo docker rm -f lc_converter_container
#start container #start container
sudo docker run --name=lc_converter_container -v /var/log/lc-converter/$(date +%Y-%m-%d_%H-%M-%S):/app/logs lc_converter_image sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image

View File

@@ -80,8 +80,7 @@ class DocxBook(BookSolver):
if __name__ == "__main__": if __name__ == "__main__":
docx_file_path = f"../../books/docx/output.docx"
docx_file_path = "../../books/docx/output.docx"
logger_object = BookLogger( logger_object = BookLogger(
name="docx", book_id=docx_file_path.split("/")[-1]) name="docx", book_id=docx_file_path.split("/")[-1])
locker = Event() locker = Event()

View File

@@ -339,7 +339,7 @@ class EpubConverter:
normed_path = path.normpath(path.join( normed_path = path.normpath(path.join(
dir_name, href_in_link)).replace("\\", "/") dir_name, href_in_link)).replace("\\", "/")
full_path = [ full_path = [
path for path in self.hrefs_added_to_toc if normed_path in path] href_from_toc for href_from_toc in self.hrefs_added_to_toc if normed_path in href_from_toc]
if not full_path: if not full_path:
self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. " self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
f"While processing href in {internal_link_tag}.") f"While processing href in {internal_link_tag}.")
@@ -378,78 +378,79 @@ class EpubConverter:
""" """
def make_ids_unique(): def make_ids_unique():
for toc_href in self.hrefs_added_to_toc: for href_from_toc in self.hrefs_added_to_toc:
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}): for tag in self.html_href2html_body_soup[href_from_toc].find_all(attrs={"id": re.compile(r".+")}):
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]: if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
new_id = self.create_unique_id( new_unique_id = self.create_unique_id(
toc_href, tag.attrs["id"]) href_from_toc, tag.attrs["id"])
tag.attrs["id"] = new_id tag.attrs["id"] = new_unique_id
def process_file_anchor(): def process_file_anchor():
for toc_href in self.hrefs_added_to_toc: def span_creation():
soup = self.html_href2html_body_soup[toc_href] if new_unique_id not in self.internal_anchors:
for internal_link_tag in soup.find_all("a", anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}): new_anchor_span = self.create_new_anchor_span(html_content, new_unique_id)
a_tag_href = internal_link_tag.attrs["href"] # insert a new span to the beginning of the file
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( anchor_html_content.insert(0, new_anchor_span)
toc_href, a_tag_href, internal_link_tag) self.internal_anchors.add(new_unique_id)
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id( for html_href_from_toc in self.hrefs_added_to_toc:
a_tag_href_matched_to_toc, "") html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" for internal_link_tag in html_content.find_all("a",
if new_id not in self.internal_anchors: {"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] href_ = internal_link_tag.attrs["href"]
new_anchor_span = self.create_new_anchor_span( html_href_of_anchor = self.match_href_to_path_from_toc(
soup, new_id) html_href_from_toc, href_, internal_link_tag)
# insert a new span to the beginning of the file if html_href_of_anchor:
anchor_soup.insert(0, new_anchor_span) new_unique_id = self.create_unique_id(html_href_of_anchor, "")
self.internal_anchors.add(new_id) internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
span_creation()
del internal_link_tag.attrs["href"] del internal_link_tag.attrs["href"]
def process_file_element_anchor(): def process_file_element_anchor():
for toc_href in self.hrefs_added_to_toc: def span_creation():
soup = self.html_href2html_body_soup[toc_href] if anchor_tag.attrs["id"] not in self.internal_anchors:
# process_file_element_anchor new_anchor_span = self.create_new_anchor_span(
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}): html_content, new_unique_id)
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split( anchor_tag.insert_before(new_anchor_span)
"#") self.internal_anchors.add(new_unique_id)
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc( del anchor_tag.attrs["id"]
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
else path.normpath(toc_href).replace("\\", "/")
if a_tag_href_matched_to_toc:
new_id = self.create_unique_id(
a_tag_href_matched_to_toc, a_tag_id)
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc] for html_href_from_toc in self.hrefs_added_to_toc:
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \ html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
anchor_soup.find_all( # process_file_element_anchor
attrs={"id": a_tag_id}) # if link is a footnote for internal_link_tag in html_content.find_all("a",
{"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
href_, id_ = internal_link_tag.attrs["href"].split("#")
html_href_of_anchor = self.match_href_to_path_from_toc(
html_href_from_toc, href_, internal_link_tag) if href_ \
else path.normpath(html_href_from_toc).replace("\\", "/") # the same page
if html_href_of_anchor:
new_unique_id = self.create_unique_id(html_href_of_anchor, id_)
anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
anchor_tags = anchor_html_content.find_all(attrs={"id": new_unique_id}) or \
anchor_html_content.find_all(attrs={"id": id_}) # if link is a footnote
if anchor_tags: if anchor_tags:
if len(anchor_tags) > 1: if len(anchor_tags) > 1:
self.logger.log(f"Warning in {toc_href}: multiple anchors:" self.logger.log(f"Warning in {html_href_from_toc}: multiple anchors:"
f"{len(anchor_tags)} found.\n" f"{len(anchor_tags)} found.\n"
f"{anchor_tags}\n" f"{anchor_tags}\n"
f"While processing {internal_link_tag}") f"While processing {internal_link_tag}")
anchor_tag = anchor_tags[0] anchor_tag = anchor_tags[0]
assert anchor_tag.attrs["id"] in [new_id, a_tag_id] assert anchor_tag.attrs["id"] in [new_unique_id, id_]
# if anchor is found we could add placeholder for link creation on server side. # if anchor is found we could add placeholder for link creation on server side.
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}" internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
# create span to have cyclic links, link has 1 type of class, anchor another # create span to have cyclic links, link has 1 type of class, anchor another
if anchor_tag.attrs["id"] not in self.internal_anchors: span_creation()
new_anchor_span = self.create_new_anchor_span(
soup, new_id)
anchor_tag.insert_before(new_anchor_span)
self.internal_anchors.add(new_id)
del anchor_tag.attrs["id"]
del internal_link_tag.attrs["href"] del internal_link_tag.attrs["href"]
else: else:
internal_link_tag.attrs["converter-mark"] = "bad-link" internal_link_tag.attrs["converter-mark"] = "bad-link"
self.logger.log(f"Error in {toc_href}." self.logger.log(f"Error in {html_href_from_toc}."
f" While processing {internal_link_tag} no anchor found." f" While processing {internal_link_tag} no anchor found."
f" Should be anchor with new id={new_id} in" f" Should be anchor with new id={new_unique_id} in"
f" {a_tag_href_matched_to_toc} file." f" {html_href_of_anchor} file."
f" Old id={a_tag_id}") f" Old id={id_}")
# 1. make ids to be unique in all documents # 1. make ids to be unique in all documents
make_ids_unique() make_ids_unique()
# 2a. process anchor which is a whole htm|html|xhtml file # 2a. process anchor which is a whole htm|html|xhtml file

View File

@@ -1,6 +1,6 @@
import re import re
import cssutils import cssutils
from typing import List, Tuple, Union from typing import List, Tuple
from os.path import dirname, normpath, join from os.path import dirname, normpath, join
from src.util.color_reader import str2hex from src.util.color_reader import str2hex
@@ -74,7 +74,7 @@ class StyleReader:
def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str: def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
size_number = float(size_number.replace(unit_to_replace, "")) * multiplier size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
return str(size_number) + "px" return str(size_number) + "px"
has_size = re.search(r"(\d+)([\w%]+)", size_value) has_size = re.search(r"(\d+(?:\.\d+)?)([\w%]+)", size_value)
values: List = size_value.split(" ") values: List = size_value.split(" ")
if has_size: if has_size:
size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0] size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]

View File

@@ -1,4 +1,5 @@
import os import os
import time
import logging import logging
from typing import Union from typing import Union
@@ -51,12 +52,17 @@ class BookLogger:
self.main_logger = main_logger self.main_logger = main_logger
self.logger = logging.getLogger(name) self.logger = logging.getLogger(name)
self.logger.propagate = False self.logger.propagate = False
folder_path = os.path.dirname( folder_path = os.path.dirname(
os.path.dirname(os.path.abspath(__file__))) os.path.dirname(os.path.abspath(__file__)))
folder_path = os.path.dirname(folder_path) folder_path = os.path.join(os.path.dirname(folder_path), f"logs/{time.strftime('%d-%m-%Y_%H-00')}/")
filename = f"logs/{book_id}.log" filename = f"{book_id}.log"
file_path = os.path.join(folder_path, filename) file_path = os.path.join(folder_path, filename)
if not os.path.exists(folder_path):
os.makedirs(folder_path + time.strftime("%Y-%m-%_%H"))
file_handler = logging.FileHandler(file_path, mode=filemode) file_handler = logging.FileHandler(file_path, mode=filemode)
file_format = logging.Formatter(logging_format) file_format = logging.Formatter(logging_format)
file_handler.setFormatter(file_format) file_handler.setFormatter(file_format)
self.logger.addHandler(file_handler) self.logger.addHandler(file_handler)