forked from LiveCarta/BookConverter
Merge pull request #311 from Teqniksoft/kiryl/converter_fix
Kiryl/converter fix
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project name="LawCarta converter project" default="full-build">
|
<project name="LiveCarta converter project" default="full-build">
|
||||||
|
|
||||||
<target name="full-build"
|
<target name="full-build"
|
||||||
depends="prepare"
|
depends="prepare"
|
||||||
|
|||||||
@@ -5,4 +5,4 @@ sudo docker stop lc_converter_container
|
|||||||
#remove container
|
#remove container
|
||||||
sudo docker rm -f lc_converter_container
|
sudo docker rm -f lc_converter_container
|
||||||
#start container
|
#start container
|
||||||
sudo docker run --name=lc_converter_container -v /var/log/lc-converter/$(date +%Y-%m-%d_%H-%M-%S):/app/logs lc_converter_image
|
sudo docker run --name=lc_converter_container -v /var/log/lc-converter/:/app/logs lc_converter_image
|
||||||
|
|||||||
@@ -80,8 +80,7 @@ class DocxBook(BookSolver):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
docx_file_path = f"../../books/docx/output.docx"
|
||||||
docx_file_path = "../../books/docx/output.docx"
|
|
||||||
logger_object = BookLogger(
|
logger_object = BookLogger(
|
||||||
name="docx", book_id=docx_file_path.split("/")[-1])
|
name="docx", book_id=docx_file_path.split("/")[-1])
|
||||||
locker = Event()
|
locker = Event()
|
||||||
|
|||||||
@@ -339,7 +339,7 @@ class EpubConverter:
|
|||||||
normed_path = path.normpath(path.join(
|
normed_path = path.normpath(path.join(
|
||||||
dir_name, href_in_link)).replace("\\", "/")
|
dir_name, href_in_link)).replace("\\", "/")
|
||||||
full_path = [
|
full_path = [
|
||||||
path for path in self.hrefs_added_to_toc if normed_path in path]
|
href_from_toc for href_from_toc in self.hrefs_added_to_toc if normed_path in href_from_toc]
|
||||||
if not full_path:
|
if not full_path:
|
||||||
self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
|
self.logger.log(f"Error in {cur_file_path} file. No {normed_path} file found in added to TOC documents. "
|
||||||
f"While processing href in {internal_link_tag}.")
|
f"While processing href in {internal_link_tag}.")
|
||||||
@@ -378,78 +378,79 @@ class EpubConverter:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
def make_ids_unique():
|
def make_ids_unique():
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
for href_from_toc in self.hrefs_added_to_toc:
|
||||||
for tag in self.html_href2html_body_soup[toc_href].find_all(attrs={"id": re.compile(r".+")}):
|
for tag in self.html_href2html_body_soup[href_from_toc].find_all(attrs={"id": re.compile(r".+")}):
|
||||||
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
|
if tag.attrs.get("class") not in ["converter-chapter-mark", "footnote-element"]:
|
||||||
new_id = self.create_unique_id(
|
new_unique_id = self.create_unique_id(
|
||||||
toc_href, tag.attrs["id"])
|
href_from_toc, tag.attrs["id"])
|
||||||
tag.attrs["id"] = new_id
|
tag.attrs["id"] = new_unique_id
|
||||||
|
|
||||||
def process_file_anchor():
|
def process_file_anchor():
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
def span_creation():
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
if new_unique_id not in self.internal_anchors:
|
||||||
for internal_link_tag in soup.find_all("a",
|
anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
|
||||||
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
|
new_anchor_span = self.create_new_anchor_span(html_content, new_unique_id)
|
||||||
a_tag_href = internal_link_tag.attrs["href"]
|
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
|
||||||
toc_href, a_tag_href, internal_link_tag)
|
|
||||||
if a_tag_href_matched_to_toc:
|
|
||||||
new_id = self.create_unique_id(
|
|
||||||
a_tag_href_matched_to_toc, "")
|
|
||||||
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
|
||||||
if new_id not in self.internal_anchors:
|
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
|
||||||
new_anchor_span = self.create_new_anchor_span(
|
|
||||||
soup, new_id)
|
|
||||||
# insert a new span to the beginning of the file
|
# insert a new span to the beginning of the file
|
||||||
anchor_soup.insert(0, new_anchor_span)
|
anchor_html_content.insert(0, new_anchor_span)
|
||||||
self.internal_anchors.add(new_id)
|
self.internal_anchors.add(new_unique_id)
|
||||||
|
|
||||||
|
for html_href_from_toc in self.hrefs_added_to_toc:
|
||||||
|
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
|
||||||
|
for internal_link_tag in html_content.find_all("a",
|
||||||
|
{"href": re.compile(r"(^(?!https?://).+\.(htm|html|xhtml)$)")}):
|
||||||
|
href_ = internal_link_tag.attrs["href"]
|
||||||
|
html_href_of_anchor = self.match_href_to_path_from_toc(
|
||||||
|
html_href_from_toc, href_, internal_link_tag)
|
||||||
|
if html_href_of_anchor:
|
||||||
|
new_unique_id = self.create_unique_id(html_href_of_anchor, "")
|
||||||
|
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
|
||||||
|
span_creation()
|
||||||
del internal_link_tag.attrs["href"]
|
del internal_link_tag.attrs["href"]
|
||||||
|
|
||||||
def process_file_element_anchor():
|
def process_file_element_anchor():
|
||||||
for toc_href in self.hrefs_added_to_toc:
|
def span_creation():
|
||||||
soup = self.html_href2html_body_soup[toc_href]
|
if anchor_tag.attrs["id"] not in self.internal_anchors:
|
||||||
# process_file_element_anchor
|
new_anchor_span = self.create_new_anchor_span(
|
||||||
for internal_link_tag in soup.find_all("a", {"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
|
html_content, new_unique_id)
|
||||||
a_tag_href, a_tag_id = internal_link_tag.attrs["href"].split(
|
anchor_tag.insert_before(new_anchor_span)
|
||||||
"#")
|
self.internal_anchors.add(new_unique_id)
|
||||||
a_tag_href_matched_to_toc = self.match_href_to_path_from_toc(
|
del anchor_tag.attrs["id"]
|
||||||
toc_href, a_tag_href, internal_link_tag) if a_tag_href \
|
|
||||||
else path.normpath(toc_href).replace("\\", "/")
|
|
||||||
if a_tag_href_matched_to_toc:
|
|
||||||
new_id = self.create_unique_id(
|
|
||||||
a_tag_href_matched_to_toc, a_tag_id)
|
|
||||||
|
|
||||||
anchor_soup = self.html_href2html_body_soup[a_tag_href_matched_to_toc]
|
for html_href_from_toc in self.hrefs_added_to_toc:
|
||||||
anchor_tags = anchor_soup.find_all(attrs={"id": new_id}) or \
|
html_content: BeautifulSoup = self.html_href2html_body_soup[html_href_from_toc]
|
||||||
anchor_soup.find_all(
|
# process_file_element_anchor
|
||||||
attrs={"id": a_tag_id}) # if link is a footnote
|
for internal_link_tag in html_content.find_all("a",
|
||||||
|
{"href": re.compile(r"(^.+\.(htm|html|xhtml)#.+)|(^#.+)")}):
|
||||||
|
href_, id_ = internal_link_tag.attrs["href"].split("#")
|
||||||
|
html_href_of_anchor = self.match_href_to_path_from_toc(
|
||||||
|
html_href_from_toc, href_, internal_link_tag) if href_ \
|
||||||
|
else path.normpath(html_href_from_toc).replace("\\", "/") # the same page
|
||||||
|
if html_href_of_anchor:
|
||||||
|
new_unique_id = self.create_unique_id(html_href_of_anchor, id_)
|
||||||
|
|
||||||
|
anchor_html_content = self.html_href2html_body_soup[html_href_of_anchor]
|
||||||
|
anchor_tags = anchor_html_content.find_all(attrs={"id": new_unique_id}) or \
|
||||||
|
anchor_html_content.find_all(attrs={"id": id_}) # if link is a footnote
|
||||||
if anchor_tags:
|
if anchor_tags:
|
||||||
if len(anchor_tags) > 1:
|
if len(anchor_tags) > 1:
|
||||||
self.logger.log(f"Warning in {toc_href}: multiple anchors:"
|
self.logger.log(f"Warning in {html_href_from_toc}: multiple anchors:"
|
||||||
f"{len(anchor_tags)} found.\n"
|
f"{len(anchor_tags)} found.\n"
|
||||||
f"{anchor_tags}\n"
|
f"{anchor_tags}\n"
|
||||||
f"While processing {internal_link_tag}")
|
f"While processing {internal_link_tag}")
|
||||||
|
|
||||||
anchor_tag = anchor_tags[0]
|
anchor_tag = anchor_tags[0]
|
||||||
assert anchor_tag.attrs["id"] in [new_id, a_tag_id]
|
assert anchor_tag.attrs["id"] in [new_unique_id, id_]
|
||||||
# if anchor is found we could add placeholder for link creation on server side.
|
# if anchor is found we could add placeholder for link creation on server side.
|
||||||
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_id + "}}"
|
internal_link_tag.attrs["placeholder"] = "{{tempStyleToAnchor-" + new_unique_id + "}}"
|
||||||
# create span to have cyclic links, link has 1 type of class, anchor another
|
# create span to have cyclic links, link has 1 type of class, anchor another
|
||||||
if anchor_tag.attrs["id"] not in self.internal_anchors:
|
span_creation()
|
||||||
new_anchor_span = self.create_new_anchor_span(
|
|
||||||
soup, new_id)
|
|
||||||
anchor_tag.insert_before(new_anchor_span)
|
|
||||||
self.internal_anchors.add(new_id)
|
|
||||||
del anchor_tag.attrs["id"]
|
|
||||||
del internal_link_tag.attrs["href"]
|
del internal_link_tag.attrs["href"]
|
||||||
else:
|
else:
|
||||||
internal_link_tag.attrs["converter-mark"] = "bad-link"
|
internal_link_tag.attrs["converter-mark"] = "bad-link"
|
||||||
self.logger.log(f"Error in {toc_href}."
|
self.logger.log(f"Error in {html_href_from_toc}."
|
||||||
f" While processing {internal_link_tag} no anchor found."
|
f" While processing {internal_link_tag} no anchor found."
|
||||||
f" Should be anchor with new id={new_id} in"
|
f" Should be anchor with new id={new_unique_id} in"
|
||||||
f" {a_tag_href_matched_to_toc} file."
|
f" {html_href_of_anchor} file."
|
||||||
f" Old id={a_tag_id}")
|
f" Old id={id_}")
|
||||||
# 1. make ids to be unique in all documents
|
# 1. make ids to be unique in all documents
|
||||||
make_ids_unique()
|
make_ids_unique()
|
||||||
# 2a. process anchor which is a whole htm|html|xhtml file
|
# 2a. process anchor which is a whole htm|html|xhtml file
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
import cssutils
|
import cssutils
|
||||||
from typing import List, Tuple, Union
|
from typing import List, Tuple
|
||||||
from os.path import dirname, normpath, join
|
from os.path import dirname, normpath, join
|
||||||
|
|
||||||
from src.util.color_reader import str2hex
|
from src.util.color_reader import str2hex
|
||||||
@@ -74,7 +74,7 @@ class StyleReader:
|
|||||||
def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
|
def convert_size_number(size_number: str, unit_to_replace: str, multiplier: float) -> str:
|
||||||
size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
|
size_number = float(size_number.replace(unit_to_replace, "")) * multiplier
|
||||||
return str(size_number) + "px"
|
return str(size_number) + "px"
|
||||||
has_size = re.search(r"(\d+)([\w%]+)", size_value)
|
has_size = re.search(r"(\d+(?:\.\d+)?)([\w%]+)", size_value)
|
||||||
values: List = size_value.split(" ")
|
values: List = size_value.split(" ")
|
||||||
if has_size:
|
if has_size:
|
||||||
size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
|
size_number_idx = [i for i, value in enumerate(values) if re.search("(\d+)([\w%]+)", value)][0]
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import time
|
||||||
import logging
|
import logging
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
@@ -51,12 +52,17 @@ class BookLogger:
|
|||||||
self.main_logger = main_logger
|
self.main_logger = main_logger
|
||||||
self.logger = logging.getLogger(name)
|
self.logger = logging.getLogger(name)
|
||||||
self.logger.propagate = False
|
self.logger.propagate = False
|
||||||
|
|
||||||
folder_path = os.path.dirname(
|
folder_path = os.path.dirname(
|
||||||
os.path.dirname(os.path.abspath(__file__)))
|
os.path.dirname(os.path.abspath(__file__)))
|
||||||
folder_path = os.path.dirname(folder_path)
|
folder_path = os.path.join(os.path.dirname(folder_path), f"logs/{time.strftime('%d-%m-%Y_%H-00')}/")
|
||||||
filename = f"logs/{book_id}.log"
|
filename = f"{book_id}.log"
|
||||||
file_path = os.path.join(folder_path, filename)
|
file_path = os.path.join(folder_path, filename)
|
||||||
|
|
||||||
|
if not os.path.exists(folder_path):
|
||||||
|
os.makedirs(folder_path + time.strftime("%Y-%m-%_%H"))
|
||||||
file_handler = logging.FileHandler(file_path, mode=filemode)
|
file_handler = logging.FileHandler(file_path, mode=filemode)
|
||||||
|
|
||||||
file_format = logging.Formatter(logging_format)
|
file_format = logging.Formatter(logging_format)
|
||||||
file_handler.setFormatter(file_format)
|
file_handler.setFormatter(file_format)
|
||||||
self.logger.addHandler(file_handler)
|
self.logger.addHandler(file_handler)
|
||||||
|
|||||||
Reference in New Issue
Block a user