quick fix

This commit is contained in:
shirshasa
2020-08-28 13:44:43 +03:00
parent f27eefb96b
commit b66ef6296f
3 changed files with 14 additions and 18 deletions

View File

@@ -32,7 +32,7 @@ class Book:
main_logger=main_logger) main_logger=main_logger)
self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id) self.book_api_wrapper = BookApiWrapper(access, self.logger_object, book_id)
assert BookConfig.SUPPORTED_LEVELS == len(BookConfig.SUPPORTED_HEADERS), \ assert LawCartaConfig.SUPPORTED_LEVELS == len(LawCartaConfig.SUPPORTED_HEADERS), \
"Length of headers doesn't match allowed levels." "Length of headers doesn't match allowed levels."
def save_docx(self, content): def save_docx(self, content):

View File

@@ -5,7 +5,7 @@ import re
from shutil import copyfile from shutil import copyfile
from bs4 import BeautifulSoup, NavigableString from bs4 import BeautifulSoup, NavigableString
from config import BookConfig, BookLogger, BookApiWrapper from config import LawCartaConfig, BookLogger, BookApiWrapper
class HTMLPreprocessor: class HTMLPreprocessor:
@@ -49,8 +49,8 @@ class HTMLPreprocessor:
@classmethod @classmethod
def convert_pt_to_px(cls, value): def convert_pt_to_px(cls, value):
value = int(value) value = int(value)
if value == BookConfig.WORD_DEFAULT_FONT_SIZE: if value == LawCartaConfig.WORD_DEFAULT_FONT_SIZE:
return BookConfig.LAWCARTA_DEFAULT_FONT_SIZE return LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE
else: else:
return value return value
@@ -70,7 +70,7 @@ class HTMLPreprocessor:
size = size.group(1) size = size.group(1)
new_size = cls.convert_pt_to_px(size) new_size = cls.convert_pt_to_px(size)
if new_size == BookConfig.LAWCARTA_DEFAULT_FONT_SIZE: if new_size == LawCartaConfig.LAWCARTA_DEFAULT_FONT_SIZE:
return "" return ""
return re.sub(size + "pt", str(new_size) + "px", style) return re.sub(size + "pt", str(new_size) + "px", style)
@@ -178,7 +178,7 @@ class HTMLPreprocessor:
p.attrs = {} p.attrs = {}
style = '' style = ''
if align is not None and align != BookConfig.DEFAULT_ALIGN_STYLE: if align is not None and align != LawCartaConfig.DEFAULT_ALIGN_STYLE:
style += f'text-align: {align};' style += f'text-align: {align};'
if indent is not None: if indent is not None:
@@ -280,10 +280,6 @@ class HTMLPreprocessor:
tag.string = tag.text.replace('\u200c', '') tag.string = tag.text.replace('\u200c', '')
tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '') tag['href'] = tag.attrs.get('href').replace('%E2%80%8C', '')
# %E2%80%8C
for tag in a_tags_with_href:
print(tag)
@staticmethod @staticmethod
def _clean_footnote_content(content): def _clean_footnote_content(content):
content = content.strip() content = content.strip()
@@ -433,7 +429,7 @@ class HTMLPreprocessor:
""" """
Function to convert all lower level headings to p tags Function to convert all lower level headings to p tags
""" """
pattern = f'^h[{BookConfig.SUPPORTED_LEVELS + 1}-9]$' pattern = f'^h[{LawCartaConfig.SUPPORTED_LEVELS + 1}-9]$'
header_tags = self.body_tag.find_all(re.compile(pattern)) header_tags = self.body_tag.find_all(re.compile(pattern))
for tag in header_tags: for tag in header_tags:
tag.name = 'p' tag.name = 'p'
@@ -521,8 +517,8 @@ class HTMLPreprocessor:
if title == "": if title == "":
tag.unwrap() tag.unwrap()
else: else:
assert tag.name in BookConfig.SUPPORTED_HEADERS, \ assert tag.name in LawCartaConfig.SUPPORTED_HEADERS, \
f'Preprocessing went wrong, there is still h{BookConfig.SUPPORTED_LEVELS + 1}-h9 headings.' f'Preprocessing went wrong, there is still h{LawCartaConfig.SUPPORTED_LEVELS + 1}-h9 headings.'
# if tag.name in ["h4", "h5", "h6"]: # if tag.name in ["h4", "h5", "h6"]:
# tag.name = "h3" # All the lower level headings will be transformed to h3 headings # tag.name = "h3" # All the lower level headings will be transformed to h3 headings

View File

@@ -4,7 +4,7 @@ import codecs
import json import json
from copy import copy from copy import copy
from config import BookConfig from config import LawCartaConfig
class JSONConverter: class JSONConverter:
@@ -34,7 +34,7 @@ class JSONConverter:
:param ind: Index of header in content list. :param ind: Index of header in content list.
""" """
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
title = self.content[ind].text title = self.content[ind].text
curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag curr_outline = int(re.sub(r"^h", "", self.content[ind].name)) # extract outline from tag
result = { result = {
@@ -47,7 +47,7 @@ class JSONConverter:
while ind < len(self.content): while ind < len(self.content):
# 1. next tag is a header # 1. next tag is a header
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
outline = int(re.sub(r"^h", "", self.content[ind].name)) outline = int(re.sub(r"^h", "", self.content[ind].name))
# - recursion step until h_i > h_initial # - recursion step until h_i > h_initial
if outline > curr_outline: if outline > curr_outline:
@@ -100,13 +100,13 @@ class JSONConverter:
while ind < len(self.content): while ind < len(self.content):
res = {} res = {}
if self.content[ind].name in BookConfig.SUPPORTED_HEADERS: if self.content[ind].name in LawCartaConfig.SUPPORTED_HEADERS:
res, ind = self.header_to_json(ind) res, ind = self.header_to_json(ind)
else: else:
chapter_title = f'Untitled chapter {ch_num}' chapter_title = f'Untitled chapter {ch_num}'
chapter = [] chapter = []
while ind < len(self.content) and self.content[ind].name not in BookConfig.SUPPORTED_HEADERS: while ind < len(self.content) and self.content[ind].name not in LawCartaConfig.SUPPORTED_HEADERS:
if not self._is_empty_p_tag(self.content[ind]): if not self._is_empty_p_tag(self.content[ind]):
chapter.append(self.format_html(str(self.content[ind]))) chapter.append(self.format_html(str(self.content[ind])))
ind += 1 ind += 1