diff --git a/src/modm_data/html/document.py b/src/modm_data/html/document.py index 3371819..4335fb5 100644 --- a/src/modm_data/html/document.py +++ b/src/modm_data/html/document.py @@ -1,7 +1,7 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -import re +import re, os import logging from pathlib import Path from functools import cached_property @@ -13,7 +13,7 @@ class Document: def __init__(self, path: str): self.path = Path(path) - self.relpath = self.path.relative_to(Path().cwd()) + self.relpath = os.path.relpath(self.path, Path().cwd()) self.fullname = self.path.stem self.name = self.fullname.split("-")[0] self.version = self.fullname.split("-")[1] @@ -41,6 +41,8 @@ def chapter(self, pattern: str) -> Chapter: LOGGER.error(f"Cannot find chapter with pattern '{pattern}'!") if len(chapters) > 1: LOGGER.error(f"Found multiple chapters with pattern '{pattern}'!") + for chapter in chapters: + LOGGER.error(f" - {chapter.name}") assert len(chapters) == 1 return chapters[0] diff --git a/src/modm_data/html/stmicro/__init__.py b/src/modm_data/html/stmicro/__init__.py index b70414f..bca0b82 100644 --- a/src/modm_data/html/stmicro/__init__.py +++ b/src/modm_data/html/stmicro/__init__.py @@ -1,7 +1,8 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -from .datasheet import DatasheetMicro, DatasheetSensor +from .datasheet_sensor import DatasheetSensor +from .datasheet_stm32 import DatasheetStm32 from .reference import ReferenceManual from .document import load_documents, load_document_devices from .document import datasheet_for_device, reference_manual_for_device diff --git a/src/modm_data/html/stmicro/datasheet_sensor.py b/src/modm_data/html/stmicro/datasheet_sensor.py new file mode 100644 index 0000000..adbb57e --- /dev/null +++ b/src/modm_data/html/stmicro/datasheet_sensor.py @@ -0,0 +1,25 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +import re +import itertools +from pathlib import Path +from functools import cached_property, cache +from collections import defaultdict + +from .helper import split_device_filter, split_package +from ...html.text import ReDict + +import modm_data.html as html + + +class DatasheetSensor(html.Document): + def __init__(self, path: str): + super().__init__(path) + + def __repr__(self) -> str: + return f"DSsensor({self.fullname})" + + @cache + def register_map(self, assert_table=True): + pass diff --git a/src/modm_data/html/stmicro/datasheet.py b/src/modm_data/html/stmicro/datasheet_stm32.py similarity index 97% rename from src/modm_data/html/stmicro/datasheet.py rename to src/modm_data/html/stmicro/datasheet_stm32.py index 5fe4147..c77f16d 100644 --- a/src/modm_data/html/stmicro/datasheet.py +++ b/src/modm_data/html/stmicro/datasheet_stm32.py @@ -14,14 +14,14 @@ import modm_data.html as html -class DatasheetMicro(html.Document): +class DatasheetStm32(html.Document): def __init__(self, path: str): super().__init__(path) self._id = {} self._devices = {} def __repr__(self) -> str: - return f"DSµC({self.fullname})" + return f"DSstm32({self.fullname})" @cached_property def device_family(self) -> str: @@ -247,11 +247,3 @@ def packages_pins(self): data_pin["alternate"][af].extend(signals) return data_packages, data_pins - - -class DatasheetSensor(html.Document): - def __init__(self, path: str): - super().__init__(path) - - def __repr__(self) -> str: - return f"DSsens({self.fullname})" diff --git a/src/modm_data/html/stmicro/document.py b/src/modm_data/html/stmicro/document.py index ade7d16..3fa7649 100644 --- 
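Note on the Document.relpath change above: pathlib's Path.relative_to() only accepts true subpaths of the base directory, while os.path.relpath() can step upwards with "..", presumably why it was swapped in for documents that live outside the current working directory. A standalone sketch with illustrative paths, not project code:

    import os
    from pathlib import Path

    base = Path("/work/project")
    inside = Path("/work/project/docs/DS1234-v1.html")
    outside = Path("/work/external/DS1234-v1.html")

    print(inside.relative_to(base))        # docs/DS1234-v1.html
    print(os.path.relpath(outside, base))  # ../external/DS1234-v1.html

    try:
        outside.relative_to(base)          # raises ValueError: not a subpath of base
    except ValueError as error:
        print(error)
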
a/src/modm_data/html/stmicro/document.py +++ b/src/modm_data/html/stmicro/document.py @@ -5,7 +5,8 @@ from collections import defaultdict from ...html import Document from ...utils import cache_path, ext_path -from .datasheet import DatasheetMicro, DatasheetSensor +from .datasheet_stm32 import DatasheetStm32 +from .datasheet_sensor import DatasheetSensor from .reference import ReferenceManual from ...owl import DeviceIdentifier from ...owl.stmicro import did_from_string @@ -27,7 +28,7 @@ def load_documents() -> list: # FIXME: Better detection that DS13252 is a STM32WB55 module, not a chip! if any("STM32" in h.html for h in chap[0].headings()) and \ "DS13252" not in doc.name and "DS14096" not in doc.name: - documents[doc.name][doc.version] = DatasheetMicro(path) + documents[doc.name][doc.version] = DatasheetStm32(path) else: documents[doc.name][doc.version] = DatasheetSensor(path) elif "RM" in doc.name: @@ -35,7 +36,7 @@ def load_documents() -> list: return documents -def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, DatasheetMicro], +def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, DatasheetStm32], dict[DeviceIdentifier, ReferenceManual]]: global DOCUMENT_CACHE if DOCUMENT_CACHE is not None: @@ -48,7 +49,7 @@ def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, Datas docs = {} for path in set(json_data["ds"].values()): - docs[path] = DatasheetMicro(path) + docs[path] = DatasheetStm32(path) for path in set(json_data["rm"].values()): docs[path] = ReferenceManual(path) datasheets = {did_from_string(did): docs[path] @@ -63,7 +64,7 @@ def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, Datas doc = list(versions.values())[-1] # print(doc.path_pdf.relative_to(Path().cwd()), doc.path.relative_to(Path().cwd())) # print(doc.devices) - if isinstance(doc, DatasheetMicro): + if isinstance(doc, DatasheetStm32): if not doc.devices: raise ValueError(f"{doc} has no associated devices!") for dev in doc.devices: @@ -120,7 +121,7 @@ def _document_for_device(did: DeviceIdentifier, documents): return None -def datasheet_for_device(did: DeviceIdentifier) -> DatasheetMicro: +def datasheet_for_device(did: DeviceIdentifier) -> DatasheetStm32: datasheets, _ = load_document_devices() return _document_for_device(did, datasheets) diff --git a/src/modm_data/html2owl/stmicro/__main__.py b/src/modm_data/html2owl/stmicro/__main__.py index bea1104..0d11c32 100644 --- a/src/modm_data/html2owl/stmicro/__main__.py +++ b/src/modm_data/html2owl/stmicro/__main__.py @@ -9,7 +9,7 @@ from collections import defaultdict from multiprocessing.pool import ThreadPool -from modm_data.html.stmicro import DatasheetMicro, ReferenceManual, load_documents +from modm_data.html.stmicro import DatasheetStm32, ReferenceManual, load_documents from modm_data.owl import Store from modm_data.py2owl.stmicro import owl_from_doc @@ -25,7 +25,7 @@ def main(): for name, versions in load_documents().items(): # always use latest version for now doc = list(versions.values())[-1] - if isinstance(doc, DatasheetMicro): + if isinstance(doc, DatasheetStm32): docs.append(doc) elif isinstance(doc, ReferenceManual): docs.append(doc) @@ -40,7 +40,7 @@ def main(): path = Path(args.document).absolute() if path.stem.startswith("DS"): - doc = DatasheetMicro(path) + doc = DatasheetStm32(path) elif path.stem.startswith("RM"): doc = ReferenceManual(path) diff --git a/src/modm_data/html2svd/stmicro/__init__.py b/src/modm_data/html2svd/stmicro/__init__.py index 22e4fbf..ee08e88 
100644 --- a/src/modm_data/html2svd/stmicro/__init__.py +++ b/src/modm_data/html2svd/stmicro/__init__.py @@ -2,3 +2,4 @@ # SPDX-License-Identifier: MPL-2.0 from .reference import memory_map_from_reference_manual +from .datasheet import memory_map_from_datasheet diff --git a/src/modm_data/html2svd/stmicro/__main__.py b/src/modm_data/html2svd/stmicro/__main__.py index 80347d1..9f2270c 100644 --- a/src/modm_data/html2svd/stmicro/__main__.py +++ b/src/modm_data/html2svd/stmicro/__main__.py @@ -8,8 +8,8 @@ from pathlib import Path from multiprocessing.pool import ThreadPool -from modm_data.html.stmicro import ReferenceManual, load_documents -from modm_data.html2svd.stmicro import memory_map_from_reference_manual +from modm_data.html.stmicro import ReferenceManual, DatasheetSensor, load_documents +from modm_data.html2svd.stmicro import memory_map_from_reference_manual, memory_map_from_datasheet from modm_data.svd import format_svd, write_svd from modm_data.utils import ext_path from anytree import RenderTree @@ -17,7 +17,8 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument("--document", type=str, default="") + parser.add_argument("--stm32", type=Path) + parser.add_argument("--sensor", type=Path) parser.add_argument("--all", action="store_true", default=False) args = parser.parse_args() @@ -30,7 +31,7 @@ def main(): docs.append(doc) Path("log/stmicro/svd").mkdir(exist_ok=True, parents=True) - calls = [f"python3 -m modm_data.html2svd.stmicro --document {doc.path} " + calls = [f"python3 -m modm_data.html2svd.stmicro --stm32 {doc.path} " f"> log/stmicro/svd/html_{doc.name}.txt 2>&1" for doc in docs] with ThreadPool() as pool: retvals = list(tqdm.tqdm(pool.imap(lambda c: subprocess.run(c, shell=True), calls), total=len(calls))) @@ -38,12 +39,17 @@ def main(): if retval.returncode != 0: print(call) return all(r.returncode == 0 for r in retvals) - path = Path(args.document).absolute() - doc = ReferenceManual(path) + if args.stm32: + doc = ReferenceManual(args.stm32.absolute()) + elif args.sensor: + doc = DatasheetSensor(args.sensor.absolute()) print(doc.path_pdf.relative_to(Path().cwd()), doc.path.relative_to(Path().cwd())) - mmaptrees = memory_map_from_reference_manual(doc) + if args.stm32: + mmaptrees = memory_map_from_reference_manual(doc) + elif args.sensor: + mmaptrees = memory_map_from_datasheet(doc) for mmaptree in mmaptrees: print(RenderTree(mmaptree, maxlevel=2)) svd = format_svd(mmaptree) diff --git a/src/modm_data/html2svd/stmicro/datasheet.py b/src/modm_data/html2svd/stmicro/datasheet.py new file mode 100644 index 0000000..35e93f4 --- /dev/null +++ b/src/modm_data/html2svd/stmicro/datasheet.py @@ -0,0 +1,379 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +import re +from functools import cached_property +from collections import defaultdict +from anytree import RenderTree + +from ...html.stmicro.helper import split_device_filter +from ...svd import * +from ...header2svd.stmicro.tree import _normalize_order +from ...cubemx import cubemx_device_list +from ...html import replace as html_replace + + +def _deduplicate_bit_fields(bit_fields): + named_fields = defaultdict(set) + for field in sorted(bit_fields, key=lambda f: f.position): + named_fields[field.name].add(field.position) + + new_fields = [] + for name, positions in named_fields.items(): + position = min(positions) + width = max(positions) + 1 - position + new_fields.append(BitField(name, position, width)) + + return new_fields + + +def _peripheral_map_to_tree(chapter, peripheral_maps): + 
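The new _deduplicate_bit_fields above collapses per-bit table entries that share a name into one field spanning the minimum to the maximum position (any gap in between is absorbed into the width). A standalone illustration, using a simple dataclass as a stand-in for the project's svd BitField node:

    from collections import defaultdict
    from dataclasses import dataclass

    @dataclass
    class Field:                 # stand-in for modm_data.svd.BitField
        name: str
        position: int
        width: int = 1

    def deduplicate(fields):
        named = defaultdict(set)
        for field in sorted(fields, key=lambda f: f.position):
            named[field.name].add(field.position)
        # one field per name, spanning min..max of the collected positions
        return [Field(name, min(pos), max(pos) + 1 - min(pos))
                for name, pos in named.items()]

    bits = [Field("EN", 0), Field("DATA", 8), Field("DATA", 9), Field("DATA", 10)]
    print(deduplicate(bits))
    # [Field(name='EN', position=0, width=1), Field(name='DATA', position=8, width=3)]
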
cap_replace = {"STM32F415/417xx": "STM32F415/417"} + + peripheral_trees = [] + for caption, (heading, register_map) in peripheral_maps.items(): + print(caption) + if match := re.search(f"OTG_[FH]S", caption): + replace_name = peripheral_name = "OTG" + elif match := re.search(f"JPEG", caption): + replace_name = peripheral_name = "JPEG" + elif match := re.search(f"CCU ", caption): + peripheral_name = "CANCCU" + replace_name = "FDCAN_CCU" + else: + peripheral_names = {n.split("_")[0] for n in register_map.keys()} + replace_name = peripheral_name = list(sorted(peripheral_names))[-1] + if all(p.startswith("COMP") for p in peripheral_names): + peripheral_name = "COMP" + replace_name = "" + if all(p.startswith("OPAMP") for p in peripheral_names): + peripheral_name = "OPAMP" + replace_name = "" + elif len(peripheral_names) > 1: + print(f"Multiple peripheral names detected: {peripheral_names}") + + if peripheral_name == "M7": continue + # Some chapters have multiple tables for multiple instances + filters = defaultdict(set) + instances = set() + if peripheral_name.startswith("LPTIM"): + replace_name = peripheral_name = "LPTIM" + elif peripheral_name.startswith("DLYB"): + instances.add("DLYB") + elif peripheral_name.startswith("TIM"): + peripheral_name = "TIM" + if match := re.search(r"TIM(\d+) +to +TIM(\d+)", caption): + irange = list(sorted([int(match.group(1)), int(match.group(2))])) + irange = range(irange[0], irange[1] + 1) + instances.add(f"TIM({'|'.join(map(str, irange))})") + for pfilter in re.findall(r"TIM\d+(?:/\d+)*", caption): + if "/" in pfilter: + pfilter = f"TIM({pfilter[3:].replace('/', '|')})" + instances.add(f"^{pfilter}$") + elif "GPIOx" in peripheral_name: + peripheral_name = "GPIO" + for pfilter in re.findall(r"GPIO[A-Z](?:[/A-Z]+]+)?", caption): + if "/" in pfilter: + pfilter = f"GPIO({pfilter[4:].replace('/', '|')})" + instances.add(pfilter) + if instances: + filters["instances"].update(instances) + + devices = set() + for pfilter in re.findall(r"STM32[\w/]+", html_replace(caption, **cap_replace)): + devices.update(split_device_filter(pfilter) if "/" in pfilter else [pfilter]) + if devices: + filters["devices"].update(d.replace("x", ".") for d in devices) + + if "connectivity line" in chapter.name: + filters["devices"].add("STM32F10[57]") + elif "low medium high and xl density" in chapter.name: + filters["devices"].add("STM32F10[123]") + + peripheral_type = PeripheralType(peripheral_name, _chapter=chapter, + filters=dict(filters), section=heading) + for rname, (offset, bitfields) in register_map.items(): + filters = {} + if replace_name: + if replace_name == "OTG" and (match := re.match("^OTG_[FH]S", rname)): + filters["instances"] = {match.group(0)} + nrname = rname.replace(match.group(0) + "_", "") + else: + nrname = rname.replace(replace_name + "_", "") + if len(rname) == len(nrname) and "_" in rname: + instance = rname.split("_")[0] + filters["instances"] = {instance+"$"} + nrname = rname.replace(instance + "_", "") + print(instance, nrname) + rname = nrname + if match := re.match("(.*?)connectivitylinedevices", rname): + rname = match.group(1) + filters["devices"] = {r"STM32F10[57]"} + elif match := re.match("(.*?)low,medium,highandXLdensitydevices", rname): + rname = match.group(1) + filters["devices"] = {r"STM32F10[123]"} + try: offset = int(offset, 16) + except: pass + register_type = Register(rname, offset, filters=filters, parent=peripheral_type) + fields = [BitField(field, bit) for bit, field in bitfields.items()] + register_type.children = 
_deduplicate_bit_fields(fields) + + peripheral_trees.append(peripheral_type) + + return peripheral_trees + + +def _expand_register_offsets(peripheral_trees): + for peripheral in peripheral_trees: + unexpanded = defaultdict(list) + for register in peripheral.children: + if (isinstance(register.offset, str) or + ("CAN" in peripheral.name and "F1R2" in register.name) or + ("GFXMMU" in peripheral.name and "LUT0L" in register.name) or + ("GFXMMU" in peripheral.name and "LUT0H" in register.name) or + ("HSEM" in peripheral.name and "R1" in register.name)): + unexpanded[str(register.offset)].append(register) + for offsets, registers in unexpanded.items(): + print(offsets, registers) + + conv = lambda i: int(i, 16) + # if match := re.search(r"x=([\d,]+)", registers[0].name): + # offsets = [offsets] * len(match.group(1).split(",")) + if any(pat in offsets for pat in ["x=", "channelnumber"]): + if matches := re.findall(r"(0x[\dA-Fa-f]+)\(x=\w+\)", offsets): + orange = enumerate(map(conv, matches)) + formula = "x" + elif "channelnumber" in offsets: + orange = enumerate(range(0, 16)) + formula = offsets.replace("channelnumber", "x") + elif "moni-ringunitnumber" in offsets: + orange = [(i, i) for i in range(1, 6)] + formula = offsets.split("(x=")[0] + else: + match = re.search(r"\(x=(\d+)(?:-\.?|\.\.)(\d+)", offsets) + orange = [(i, i) for i in range(int(match.group(1)), int(match.group(2)) + 1)] + formula = re.split(r"\(x=|,", offsets)[0] + offsets = [(ii, eval(formula, None, {"x": x})) for ii, x in orange] + print(formula, offsets, orange) + elif "-" in offsets: + omin, omax = list(map(conv, offsets.split("-"))) + offsets = enumerate(range(omin, omax+1, 4)) + elif "or" in offsets: + offsets = enumerate(list(map(conv, offsets.split("or")))) + elif "F1R2" in registers[0].name: + offsets = enumerate(range(int(offsets), int(offsets)+4*25*2+1, 4)) + elif "LUT0" in registers[0].name: + offsets = enumerate(range(int(offsets), int(offsets)+4*2044+1, 8)) + elif "HSEM" in peripheral.name: + print(offsets) + offsets = enumerate(range(int(offsets), int(offsets)+4*29+1, 4)) + else: + print(f"Unknown expansion format for {offsets}!") + return False + + fields = registers[0].children + if all(re.match(r"BKP\d+R", r.name) for r in registers): + name_template = lambda i: f"BKP{i}R" + elif "SAI" in peripheral.name: + name_template = lambda i: f"{registers[0].name[1:]}{chr(i+ord('A'))}" + elif "HRTIM" in peripheral.name: + name_template = lambda i: registers[0].name.replace("x", chr(i+ord('A'))) + elif "CAN" in peripheral.name: + name_template = lambda i: f"F{(i+3)//2}R{(i+1)%2+1}" + elif "GFXMMU" in peripheral.name: + name_template = lambda i: f"LUT{i}{registers[0].name[-1]}" + elif "HSEM" in peripheral.name: + name_template = lambda i: f"{registers[0].name[:-1]}{i+1}" + elif len(registers) == 1: + # if "x=" in registers[0].name: + # name_template = lambda i: f"{registers[0].name.split('x=')[0]}.{i}" + if "x" in registers[0].name: + name_template = lambda i: registers[0].name.replace("x", str(i)) + else: + name_template = lambda i: f"{registers[0].name}.{i}" + else: + print(f"Unknown expansion pattern for {registers}!") + return False + + for ii, offset in offsets: + nreg = Register(name_template(ii), offset, filters=registers[0].filters, parent=peripheral) + nreg.children = [BitField(f.name, f.position, f.width) for f in fields] + for register in registers: + register.parent = None + + return True + + +def _link_instance_to_type(ds, peripheral_types, instance_offsets): + cap_replace = {} + peripherals = set() + for 
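_expand_register_offsets turns the textual offset specifications from the register-map tables into concrete per-register offsets. A standalone sketch of its simplest case, a hexadecimal "min-max" range expanded in 4-byte steps; the BKPxR name template mirrors the lambda used for the backup registers in the real code:

    def expand_range(offsets: str, name_template):
        # "0x50-0x5C" -> one 32-bit register every 4 bytes, named by the template
        omin, omax = (int(o, 16) for o in offsets.split("-"))
        return [(name_template(i), offset)
                for i, offset in enumerate(range(omin, omax + 1, 4))]

    print(expand_range("0x50-0x5C", lambda i: f"BKP{i}R"))
    # [('BKP0R', 80), ('BKP1R', 84), ('BKP2R', 88), ('BKP3R', 92)]
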
caption, locations in ds.peripherals.items(): + filters = defaultdict(set) + devices = set() + for pfilter in re.findall(r"STM32[\w/]+", html_replace(caption, **cap_replace)): + devices.update(split_device_filter(pfilter) if "/" in pfilter else [pfilter]) + if "Low and medium-density device" in caption: + devices.add("STM32F10..[468B]") + elif "High-density device" in caption: + devices.add("STM32F10..[CDE]") + if devices: + filters["devices"].update(d.replace("x", ".") for d in devices) + + for (names, amin, amax, bus, sections) in locations: + for name in names: + ptypes = [t for tname, types in peripheral_types.items() for t in types if tname == name] + if not ptypes: + ptypes = [t for tname, types in peripheral_types.items() for t in types if tname in name] + if not ptypes: + ptypes = [t for tname, types in peripheral_types.items() + for t in types if t.section in sections] + if not ptypes and name.startswith("UART"): + ptypes = [t for tname, types in peripheral_types.items() for t in types if tname == "USART"] + if not ptypes and "BKP" == name: + ptypes = [t for tname, types in peripheral_types.items() for t in types if tname == "RTC"] + if not ptypes: + print(f"Cannot find peripheral type for instance {name} in section {sections}!") + nsections = list(sorted({t.section for types in peripheral_types.values() for t in types})) + print(f"Available sections are {nsections}.") + exit(1) + offsets = [v for k, v in instance_offsets.items() if re.search(k, name)] + if offsets: amin += offsets[0] + p = Peripheral(name, ptypes, amin, filters=dict(filters), sections=sections) + peripherals.add(p) + return peripherals + + +def _resolve_filters(filters, **kw): + keys = [] + for key, value in kw.items(): + if values := filters.get(key): + keys.append(key) + if any(re.search(pat, value, flags=re.IGNORECASE) for pat in values): + return True + return not keys + + +def _normalize_instances(memtree, peripherals, device): + for peripheral in peripherals: + if not _resolve_filters(peripheral.filters, devices=device.string): + continue + ptypes = peripheral.type + if len(ptypes) > 1: + ptypes = [ptype for ptype in sorted(peripheral.type, key=lambda p: -len(p.filters)) + if _resolve_filters(ptype.filters, instances=peripheral.name, devices=device.string)] + if len(ptypes) > 1 and any(p.filters for p in ptypes): + ptypes = [p for p in ptypes if p.filters] + if len(ptypes) > 1: + nptypes = [p for p in ptypes if any(p.section.startswith(per) or per.startswith(p.section) + for per in peripheral.sections)] + if nptypes: ptypes = nptypes + for pname in ["DMAMUX", "BDMA", "OCTOSPI"]: + if len(ptypes) > 1 and pname in peripheral.name: + ptypes = [p for p in ptypes if pname in p.name] + + if len(ptypes) != 1: + print(f"Unknown peripheral type {device} {peripheral} {ptypes}!") + continue + ptype = ptypes[0] + + nper = Peripheral(peripheral.name, ptype, peripheral.address, + filters=peripheral.filters, parent=memtree) + rmap = defaultdict(list) + for treg in ptype.children: + rmap[treg.name].append(treg) + + for name, tregs in rmap.items(): + regs = [reg for reg in sorted(tregs, key=lambda p: -len(p.filters)) + if _resolve_filters(reg.filters, instances=peripheral.name, devices=device.string)] + if len(regs) > 1 and any(r.filters for r in regs): + regs = [r for r in regs if r.filters] + if len(regs) != 1: + if len(regs) > 1: + print(f"Unsuccessful register filtering {peripheral.name} {device}: {tregs}!") + continue + treg = regs[0] + if _resolve_filters(treg.filters, devices=device.string, instances=nper.name): + 
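_resolve_filters is easiest to read from its call sites: a keyword whose key has no filter patterns always passes, while a key with patterns passes only if at least one regex matches. The same logic reproduced standalone with example calls:

    import re

    def resolve_filters(filters, **kw):
        keys = []
        for key, value in kw.items():
            if values := filters.get(key):
                keys.append(key)
                if any(re.search(pat, value, flags=re.IGNORECASE) for pat in values):
                    return True
        return not keys

    print(resolve_filters({}, devices="STM32F103RB"))                             # True: no filter for "devices"
    print(resolve_filters({"devices": {"STM32F10[57]"}}, devices="STM32F105VC"))  # True: pattern matches
    print(resolve_filters({"devices": {"STM32F10[57]"}}, devices="STM32F103RB"))  # False: pattern present, no match
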
preg = Register(treg.name, offset=treg.offset, width=treg.width, + filters=treg.filters, parent=nper) + for tbit in treg.children: + BitField(tbit.name, tbit.position, tbit.width, parent=preg) + + +def _build_device_trees(ds, peripheral_types, instance_offsets): + devices = ds.filter_devices(modm_device_list()) + memtrees = [] + + for device in devices: + memtree = Device(device) + peripherals = _link_instance_to_type(ds, peripheral_types, instance_offsets) + _normalize_instances(memtree, peripherals, device) + memtrees.append(memtree) + return memtrees + + +def _compactify_device_trees(memtrees): + memtree_hashes = defaultdict(list) + for memtree in memtrees: + memtree_hashes[hash(memtree)].append(memtree) + + new_memtrees = [] + for memtrees in memtree_hashes.values(): + memtree = memtrees[0] + for mtree in memtrees[1:]: + memtree.compatible.extend(mtree.compatible) + memtree.compatible.sort(key=lambda d: d.string) + memtree.name = memtree.compatible[0] + new_memtrees.append(memtree) + + return new_memtrees + + +def memory_map_from_datasheet(ds): + register = ds.chapter(r"chapter +\d+ +register +mapping") + table = register.tables("register")[0] + print(table) + registers = {} + for row in table.cell_rows(): + cname = row.match_value("name")[0].text() + ctype = row.match_value("type")[0].text() + caddr = row.match_value(r"address.*?hex")[0].text() + cvalue = row.match_value(r"default")[0].text() + ccomment = row.match_value(r"comment")[0].text() + if not ctype: continue + cvalue = int(cvalue, 2) if cvalue.isdigit() else None + print(cname, ctype, int(caddr, 16), cvalue, ccomment) + + + + + + exit(1) + + peripheral_types = defaultdict(set) + instance_offsets = {} + for chapter in all_chapters: + print() + peripheral_maps, peripheral_offsets = ds.peripheral_maps(chapter, assert_table=chapter in type_chapters) + instance_offsets.update(peripheral_offsets) + peripheral_maps = _peripheral_map_to_tree(chapter, peripheral_maps) + if not _expand_register_offsets(peripheral_maps): + exit(1) + for pmap in peripheral_maps: + print(pmap) + # print(RenderTree(pmap, maxlevel=2)) + peripheral_types[pmap.name].add(pmap) + + for name, pmaps in peripheral_types.items(): + print(name) + for pmap in pmaps: + print(pmap.section, pmap._chapter._relpath) + print(RenderTree(pmap, maxlevel=2)) + + + memtrees = _build_device_trees(ds, peripheral_types, instance_offsets) + # for tree in memtrees: + # print(RenderTree(tree, maxlevel=2)) + # exit(1) + memtrees = _compactify_device_trees(memtrees) + memtrees = [_normalize_order(memtree) for memtree in memtrees] + return memtrees diff --git a/src/modm_data/pdf/__init__.py b/src/modm_data/pdf/__init__.py index ed2f441..aa8d6b4 100644 --- a/src/modm_data/pdf/__init__.py +++ b/src/modm_data/pdf/__init__.py @@ -16,5 +16,6 @@ from .page import Page from .character import Character from .link import ObjLink, WebLink -from .graphics import Path, Image +from .path import Path +from .image import Image from .render import render_page_pdf diff --git a/src/modm_data/pdf/document.py b/src/modm_data/pdf/document.py index 00c7c4d..58917af 100644 --- a/src/modm_data/pdf/document.py +++ b/src/modm_data/pdf/document.py @@ -21,7 +21,7 @@ from collections import defaultdict from .page import Page -LOGGER = logging.getLogger(__name__) +_LOGGER = logging.getLogger(__name__) # We cannot monkey patch this class, since it's a named tuple. 
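_compactify_device_trees groups equal memory trees (via their hash) and merges their compatible device lists, so that only one SVD per unique register layout is emitted. A simplified sketch of the idea using plain dicts instead of the project's Device/anytree nodes:

    from collections import defaultdict

    def compactify(trees, key):
        groups = defaultdict(list)
        for tree in trees:
            groups[key(tree)].append(tree)     # bucket identical layouts together
        merged = []
        for same in groups.values():
            tree = same[0]
            for other in same[1:]:
                tree["compatible"].extend(other["compatible"])
            tree["compatible"].sort()
            merged.append(tree)
        return merged

    trees = [{"layout": "A", "compatible": ["stm32f103c8"]},
             {"layout": "A", "compatible": ["stm32f103cb"]},
             {"layout": "B", "compatible": ["stm32f107vc"]}]
    print(compactify(trees, key=lambda t: t["layout"]))
    # two trees remain: layout A listing both F103 variants, layout B with the F107
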
:-( @@ -48,11 +48,11 @@ def __init__(self, path: Path, autoclose: bool = False): """ path = Path(path) self.name: str = path.stem - super().__init__(path, autoclose=autoclose) """Stem of the document file name""" + super().__init__(path, autoclose=autoclose) self._path = path self._bbox_cache = defaultdict(dict) - LOGGER.debug(f"Loading: {path}") + _LOGGER.debug(f"Loading: {path}") @cached_property def metadata(self) -> dict[str, str]: @@ -84,7 +84,7 @@ def toc(self) -> list[pp.PdfOutlineItem]: outline = _OutlineItem(toc.level, toc.title, toc.is_closed, toc.n_kids, toc.page_index or last_page_index, toc.view_mode, toc.view_pos) - last_page_index = toc.page_index + last_page_index = toc.page_index or last_page_index tocs.add(outline) return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title))) diff --git a/src/modm_data/pdf/image.py b/src/modm_data/pdf/image.py new file mode 100644 index 0000000..24a4041 --- /dev/null +++ b/src/modm_data/pdf/image.py @@ -0,0 +1,86 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +""" +# PDF Images + +Images support bitmap data. +""" + +from functools import cached_property +import pypdfium2 as pp +from ..utils import Point, Rectangle, Line + + +class Image(pp.PdfImage): + """ + This class extends `pypdfium2.PdfImage` to align it with the interface of + the `Path` class so that it can be used in the same + algorithms without filtering. + + You must construct the images by calling `modm_data.pdf.page.Page.images`. + + .. note:: Images are currently ignored. + """ + # Overwrite the PdfPageObject.__new__ function + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, obj): + """ + :param obj: Page object of the image. + """ + super().__init__(obj.raw, obj.page, obj.pdf, obj.level) + assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE + self.type = pp.raw.FPDF_PAGEOBJ_IMAGE + + self.count: int = 4 + """Number of segments. Always 4 due to rectangular image form. + (For compatibility with `Path.count`.)""" + self.stroke: int = 0 + """The border stroke color. Always 0. + (For compatibility with `Path.stroke`.)""" + self.fill: int = 0 + """The image fill color. Always 0. + (For compatibility with `Path.fill`.)""" + self.width: float = 0 + """The border line width. Always 0. + (For compatibility with `Path.width`.)""" + + @cached_property + def matrix(self) -> pp.PdfMatrix: + """The transformation matrix.""" + return self.get_matrix() + + @cached_property + def bbox(self) -> Rectangle: + """The bounding box of the image.""" + bbox = Rectangle(*self.get_pos()) + if self.page.rotation: + bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, + bbox.p1.y, self.page.height - bbox.p0.x) + return bbox + + @cached_property + def points(self) -> list[Point]: + """ + The 4 points of the bounding box. + (For compatibility with `Path.points`.) + """ + points = self.bbox.points + if self.page.rotation: + points = [Point(p.y, self.page.height - p.x, p.type) for p in points] + return points + + @cached_property + def lines(self) -> list[Line]: + """ + The 4 lines of the bounding box. + (For compatibility with `Path.lines`.) 
+ """ + p = self.points + return [Line(p[0], p[1], p[1].type, 0), Line(p[1], p[2], p[2].type, 0), + Line(p[2], p[3], p[3].type, 0), Line(p[3], p[0], p[0].type, 0)] + + def __repr__(self) -> str: + return f"I{self.bbox}" diff --git a/src/modm_data/pdf/page.py b/src/modm_data/pdf/page.py index 3d86f1d..2beb50c 100644 --- a/src/modm_data/pdf/page.py +++ b/src/modm_data/pdf/page.py @@ -19,10 +19,11 @@ from ..utils import Rectangle, Region from .character import Character from .link import ObjLink, WebLink -from .graphics import Path, Image +from .path import Path +from .image import Image from .structure import Structure -LOGGER = logging.getLogger(__name__) +_LOGGER = logging.getLogger(__name__) class Page(pp.PdfPage): @@ -46,7 +47,7 @@ def __init__(self, document: "modm_data.pdf.Document", index: int): self._weblinks = None self._linked = False - LOGGER.debug(f"Loading: {index}") + _LOGGER.debug(f"Loading: {index}") self._text = self.get_textpage() self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text) @@ -177,9 +178,8 @@ def images(self) -> list[Image]: """All images.""" return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])] - def graphic_clusters(self, predicate: Callable[[Path|Image], bool] = None, - absolute_tolerance: float = None) -> \ - list[tuple[Rectangle, list[Path]]]: + def graphic_clusters(self, predicate: Callable[[Path | Image], bool] = None, + absolute_tolerance: float = None) -> list[tuple[Rectangle, list[Path]]]: if absolute_tolerance is None: absolute_tolerance = min(self.width, self.height) * 0.01 @@ -287,4 +287,4 @@ def _key(char): bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin) char._bbox = bbox elif char.unicode not in {0x20, 0xa, 0xd}: - LOGGER.debug(f"Unable to fix bbox for {char.descr()}!") + _LOGGER.debug(f"Unable to fix bbox for {char.descr()}!") diff --git a/src/modm_data/pdf/graphics.py b/src/modm_data/pdf/path.py similarity index 66% rename from src/modm_data/pdf/graphics.py rename to src/modm_data/pdf/path.py index aca3f32..bf59f28 100644 --- a/src/modm_data/pdf/graphics.py +++ b/src/modm_data/pdf/path.py @@ -7,8 +7,6 @@ PDF uses a subset of the PostScript graphics language, which draws vector paths with various rendering options. We are only interested in the basic properties, in particular, for recognizing table cell borders. - -In addition, images support bitmap data. """ import ctypes @@ -148,77 +146,3 @@ def lines(self) -> list[Line]: def __repr__(self) -> str: points = ",".join(repr(p) for p in self.points) return f"P{self.count}={points}" - - -class Image(pp.PdfImage): - """ - This class extends `pypdfium2.PdfImage` to align it with the interface of - the `Path` class so that it can be used in the same - algorithms without filtering. - - You must construct the images by calling `modm_data.pdf.page.Page.images`. - - .. note:: Images are currently ignored. - """ - # Overwrite the PdfPageObject.__new__ function - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, obj): - """ - :param obj: Page object of the image. - """ - super().__init__(obj.raw, obj.page, obj.pdf, obj.level) - assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE - self.type = pp.raw.FPDF_PAGEOBJ_IMAGE - - self.count: int = 4 - """Number of segments. Always 4 due to rectangular image form. - (For compatibility with `Path.count`.)""" - self.stroke: int = 0 - """The border stroke color. Always 0. - (For compatibility with `Path.stroke`.)""" - self.fill: int = 0 - """The image fill color. 
Always 0. - (For compatibility with `Path.fill`.)""" - self.width: float = 0 - """The border line width. Always 0. - (For compatibility with `Path.width`.)""" - - @cached_property - def matrix(self) -> pp.PdfMatrix: - """The transformation matrix.""" - return self.get_matrix() - - @cached_property - def bbox(self) -> Rectangle: - """The bounding box of the image.""" - bbox = Rectangle(*self.get_pos()) - if self.page.rotation: - bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x, - bbox.p1.y, self.page.height - bbox.p0.x) - return bbox - - @cached_property - def points(self) -> list[Point]: - """ - The 4 points of the bounding box. - (For compatibility with `Path.points`.) - """ - points = self.bbox.points - if self.page.rotation: - points = [Point(p.y, self.page.height - p.x, p.type) for p in points] - return points - - @cached_property - def lines(self) -> list[Line]: - """ - The 4 lines of the bounding box. - (For compatibility with `Path.lines`.) - """ - p = self.points - return [Line(p[0], p[1], p[1].type, 0), Line(p[1], p[2], p[2].type, 0), - Line(p[2], p[3], p[3].type, 0), Line(p[3], p[0], p[0].type, 0)] - - def __repr__(self) -> str: - return f"I{self.bbox}" diff --git a/src/modm_data/pdf2html/__init__.py b/src/modm_data/pdf2html/__init__.py index bf28123..c272980 100644 --- a/src/modm_data/pdf2html/__init__.py +++ b/src/modm_data/pdf2html/__init__.py @@ -7,5 +7,5 @@ from . import stmicro from .render import render_page_pdf -from .line import CharCluster, CharLine -from .figure import Figure +from .convert import convert, patch +from .html import format_document, write_html diff --git a/src/modm_data/pdf2html/stmicro/ast.py b/src/modm_data/pdf2html/ast.py similarity index 51% rename from src/modm_data/pdf2html/stmicro/ast.py rename to src/modm_data/pdf2html/ast.py index 226c0c9..ee252c4 100644 --- a/src/modm_data/pdf2html/stmicro/ast.py +++ b/src/modm_data/pdf2html/ast.py @@ -2,17 +2,16 @@ # SPDX-License-Identifier: MPL-2.0 import logging -from lxml import etree import anytree -from anytree import RenderTree +from anytree import RenderTree, Node from collections import defaultdict -from ...utils import list_strip, Rectangle, ReversePreOrderIter +from ..utils import Rectangle, ReversePreOrderIter from .table import VirtualTable, TableCell -LOGGER = logging.getLogger(__name__) +_LOGGER = logging.getLogger(__name__) -def _normalize_area(area): +def _normalize_area(area: Node) -> Node: for child in ReversePreOrderIter(area): if child.name.startswith("list"): # We need to normalize the xpos back to the first character @@ -24,13 +23,13 @@ def _normalize_area(area): return area -def merge_area(document, area, debug=False): +def merge_area(document: Node, area: Node, debug: bool = False) -> Node: if document is None: - document = anytree.Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None) + document = Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None) document._end = document if not area.children: return document - if debug: print() + if debug: _LOGGER.debug() def _find_end(node): # Find the last leaf node but skip lines, paragraphs, captions/tables/figures @@ -43,7 +42,7 @@ def _find_ancestor(filter_): if filter_(c)), document.root) area = _normalize_area(area) - if debug: print(RenderTree(area)) + if debug: _LOGGER.debug(RenderTree(area)) children = area.children # All area nodes up to the next top-level element must now be # xpos-aligned with the previous area's last leaf node @@ -51,7 +50,7 @@ def _find_ancestor(filter_): if 
c.name.startswith("head")), len(children)) x_em = area.page._spacing["x_em"] - if debug: print("area=", area, "connect_index=", connect_index) + if debug: _LOGGER.debug("area=", area, "connect_index=", connect_index) # Align these children with the last leaf node xpos for child in children[:connect_index]: if any(child.name.startswith(name) for name in {"list"}): @@ -68,10 +67,10 @@ def _find_ancestor(filter_): child.parent = host document._end = _find_end(document) if debug: - print("child=", child) - print("host=", host) - print("end=", document._end) - print() + _LOGGER.debug(f"{child=}", ) + _LOGGER.debug(f"{host=}") + _LOGGER.debug(f"end={document._end}") + _LOGGER.debug() # Add the remaining top-level children to connect index node if connect_index < len(children): @@ -82,19 +81,19 @@ def _find_ancestor(filter_): document._end = _find_end(document) if debug: - print() - print() + _LOGGER.debug() + _LOGGER.debug() return document -def _normalize_lists(node): +def normalize_lists(node: Node) -> Node: lists = [] current = [] current_name = None for child in node.children: # Normalize the lists from the leaves up - _normalize_lists(child) + normalize_lists(child) # then split the children based on their names if current_name is None or child.name == current_name: current.append(child) @@ -110,7 +109,7 @@ def _normalize_lists(node): for llist in lists: # Insert a new list group node and redirect all children to it if llist[0].name.startswith("list"): - nlist = anytree.Node(llist[0].name, obj=llist[0].obj, + nlist = Node(llist[0].name, obj=llist[0].obj, start=llist[0].value, xpos=llist[0].xpos) for lnode in llist: lnode.name = "element" @@ -125,7 +124,7 @@ def _normalize_lists(node): return node -def _normalize_paragraphs(document): +def normalize_paragraphs(document: Node) -> Node: paras = anytree.search.findall(document, filter_=lambda n: n.name == "para") parents = set(p.parent for p in paras if p.parent.name in {"element", "caption", "document", "cell"}) for parent in parents: @@ -144,17 +143,17 @@ def _normalize_paragraphs(document): return document -def _normalize_lines(document): +def normalize_lines(document: Node) -> Node: paras = anytree.search.findall(document, filter_=lambda n: n.name == "para") for para in paras: - text = anytree.Node("text") + text = Node("text") for line in para.children: line.parent = text para.children = [text] return document -def _normalize_captions(document): +def normalize_captions(document: Node) -> Node: captions = anytree.search.findall(document, filter_=lambda n: n.name == "caption") for caption in captions: cindex = caption.parent.children.index(caption) @@ -165,12 +164,12 @@ def _normalize_captions(document): sibling.number = caption.number break else: - LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}") + _LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}") caption.parent = None return document -def _normalize_headings(document): +def normalize_headings(document: Node) -> Node: headings = anytree.search.findall(document, filter_=lambda n: n.name.startswith("head")) for heading in headings: para = heading.children[0] @@ -185,7 +184,7 @@ def _normalize_headings(document): return document -def _normalize_registers(document): +def normalize_registers(document: Node) -> Node: bits_list = [] sections = anytree.search.findall(document, filter_=lambda n: n.name == "section") for section in (sections + (document,)): @@ -195,7 +194,7 @@ def _normalize_registers(document): if child.name == "bit": # Insert a new bits 
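normalize_lists wraps each run of consecutive list* siblings in a single group node and demotes the originals to elements. A small anytree sketch of that regrouping, simplified and without the xpos/start bookkeeping of the real function:

    from anytree import Node, RenderTree
    from itertools import groupby

    flat = [Node("para"), Node("listb"), Node("listb"), Node("listb"), Node("para")]

    root = Node("area")
    for name, run in groupby(flat, key=lambda n: n.name):
        run = list(run)
        if name.startswith("list"):
            group = Node(name, parent=root)   # one group node per run of list items
            for node in run:
                node.name = "element"         # originals become elements of the group
                node.parent = group
        else:
            for node in run:
                node.parent = root

    print(RenderTree(root))
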
group node and redirect all children to it if bits is None or bits._page != child._page: - bits = anytree.Node("table", xpos=child.xpos, obj=None, + bits = Node("table", xpos=child.xpos, obj=None, _type="bits", _width=1, _page=child._page) new_children.append(bits) bits_list.append(bits) @@ -229,7 +228,7 @@ def _normalize_registers(document): return document -def _normalize_tables(document): +def normalize_tables(document: Node) -> Node: content_tables = defaultdict(list) register_tables = [] bits_tables = [] @@ -298,7 +297,7 @@ def _push(): return document -def _normalize_chapters(document) -> list: +def normalize_chapters(document: Node) -> Node: headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3) idxs = [document.children.index(h.parent) for h in headings] + [len(document.children)] if idxs[0] != 0: @@ -321,300 +320,8 @@ def _normalize_chapters(document) -> list: chapters.append( (chapter_name, filename, document.children[idx0:idx1 + 1]) ) for title, filename, nodes in chapters: - chapter = anytree.Node("chapter", title=title, _filename=filename, parent=document) + chapter = Node("chapter", title=title, _filename=filename, parent=document) for node in nodes: node.parent = chapter return document - - -def normalize_document(document): - def _debug(func, indata, debug=0): - print(func.__name__[1:]) - if debug == -1: - print(RenderTree(indata)) - print() - outdata = func(indata) - if debug == 1: - print(RenderTree(outdata)) - print() - return outdata - - document = _debug(_normalize_lines, document) - document = _debug(_normalize_captions, document) - document = _debug(_normalize_lists, document) - document = _debug(_normalize_paragraphs, document) - document = _debug(_normalize_headings, document) - document = _debug(_normalize_registers, document) - document = _debug(_normalize_tables, document) - # document = _debug(_normalize_chapters, document) - return document - - -def _format_html_figure(xmlnode, figurenode): - tnode = etree.Element("table") - tnode.set("width", f"{int(figurenode._width * 50)}%") - xmlnode.append(tnode) - - captionnode = next((c for c in figurenode.children if c.name == "caption"), None) - if captionnode is not None: - tnode.set("id", f"figure{captionnode.number}") - caption = etree.Element("caption") - tnode.append(caption) - _format_html(caption, captionnode, with_newlines=True) - - ynode = etree.Element("tr") - tnode.append(ynode) - - xynode = etree.Element("td") - ynode.append(xynode) - xynode.text = "(omitted)" - - -def _format_html_table(xmlnode, tablenode): - tnode = etree.Element("table") - xmlnode.append(tnode) - # Format the caption - captionnode = next((c for c in tablenode.children if c.name == "caption"), None) - if captionnode is not None: - tnode.set("id", f"table{captionnode.number}") - caption = etree.Element("caption") - tnode.append(caption) - _format_html(caption, captionnode, with_newlines=True) - if tablenode.obj._type == "register": - tnode.set("class", "rt") - if tablenode.obj._type == "bitfield": - tnode.set("class", "bt") - - # Cells are ordered (y, x) positions - ypos = -1 - ynode = None - header_rows = tablenode.obj.header_rows - for cell in tablenode.obj.cells: - # Add another row to the table - if ypos != cell.y or ynode is None: - ypos = cell.y - ynode = etree.Element("tr") - tnode.append(ynode) - - # Add the right cell with spans and style - xynodespan = xynode = etree.Element("th" if cell.is_header else "td") - ynode.append(xynode) - if cell.xspan > 1: - xynode.set("colspan", 
str(cell.xspan)) - if cell.yspan > 1: - xynode.set("rowspan", str(cell.yspan)) - if not cell.rotation and tablenode.obj._type != "register" and cell.left_aligned: - xynode.set("class", "tl") - if cell.rotation: - xynodespan = etree.Element("span") - xynodespan.set("class", "tv") - xynode.append(xynodespan) - if (cell.y + cell.yspan) == header_rows: - if cl := xynode.get("class"): - xynode.set("class", "thb " + cl) - else: - xynode.set("class", "thb") - - if cell._is_simple: - xynodespan.text = cell.content.strip() - else: - cell_doc = anytree.Node("document", _page=cell.ast.page) - cell.ast.parent = cell_doc - cell_doc = _normalize_lines(cell_doc) - cell_doc = _normalize_lists(cell_doc) - cell_doc = _normalize_paragraphs(cell_doc) - # print(RenderTree(cell_doc)) - _format_html(xynodespan, cell_doc, with_newlines=True, - ignore_formatting={"bold"} if cell.is_header else None) - - -def _format_char(node, state, chars, ignore): - NOFMT = { - "superscript": False, - "subscript": False, - "italic": False, - "bold": False, - "underline": False, - } - if state is None: state = NOFMT - char = chars[0] - if char["char"] in {'\r'}: - return (True, node, state) - - # print(node, state, char["char"]) - diffs = {} - for key in NOFMT: - if state[key] != char[key] and key not in ignore: - diffs[key] = char[key] - # if diffs: print(diffs) - if not diffs: - prev_name = node.children[-1].name if node.children else None - # print(node) - if prev_name != "newline" and char["char"] == '\n': - # if not (prev_name == "chars" and node.children[-1].chars[-1] == " "): - anytree.Node("newline", parent=node) - elif prev_name != "chars": - anytree.Node("chars", parent=node, chars=char["char"]) - else: - node.children[-1].chars += char["char"] - return (True, node, state) - else: - disable = [key for key, value in diffs.items() if not value] - if disable: - state[node.name] = False - return (False, node.parent, state) - else: - enable = [key for key, value in diffs.items() if value][0] - fmtnode = anytree.Node(enable, parent=node) - state[enable] = True - return (False, fmtnode, state) - - -def _format_lines(textnode, ignore, with_newlines, with_start): - char_props = textnode.root._page._char_properties - formatn = anytree.Node("format") - chars = [] - for line in textnode.children: - if line.name == "line": - for char in line.obj.chars[0 if with_start else line.start:]: - if not with_newlines and char.unicode in {0xa, 0xd}: - continue - chars.append(char_props(line.obj, char)) - if with_newlines and chars[-1]["char"] not in {'\n'}: - char = char_props(line.obj, line.obj.chars[-1]) - char["char"] = '\n' - chars.append(char) - - chars = list_strip(chars, lambda c: c["char"] in {' ', '\n'}) - state = None - node = formatn - while chars: - popchar, node, state = _format_char(node, state, chars, ignore) - if popchar: chars.pop(0) - return formatn - - -def _format_html_fmt(xmlnode, treenode, tail=False): - CONV = { - "superscript": "sup", - "subscript": "sub", - "italic": "i", - "bold": "b", - "underline": "u", - "newline": "br", - } - # print(xmlnode, treenode) - if treenode.name == "chars": - # print(f"{'tail' if tail else 'text'} char={treenode.chars}") - if tail: - xmlnode.tail = (xmlnode.tail or "") + treenode.chars - else: - xmlnode.text = (xmlnode.text or "") + treenode.chars - return (tail, xmlnode) - else: - # print(f"sub {treenode.name}") - if tail: xmlnode = xmlnode.getparent() - subnode = etree.SubElement(xmlnode, CONV[treenode.name]) - tail = False - iternode = subnode - for child in treenode.children: - tail, 
iternode = _format_html_fmt(iternode, child, tail) - return (True, subnode) - - -def _format_html_text(xmlnode, treenode, ignore=None, with_newlines=False, with_start=True): - fmttree = _format_lines(treenode, ignore or set(), with_newlines, with_start) - tail = False - fmtnode = xmlnode - for child in fmttree.children: - tail, fmtnode = _format_html_fmt(fmtnode, child, tail) - - # print(RenderTree(fmttree)) - # print(etree.tostring(xmlnode, pretty_print=True).decode("utf-8")) - - -def _format_html(xmlnode, treenode, ignore_formatting=None, - with_newlines=False, with_start=True): - if ignore_formatting is None: - ignore_formatting = set() - # print(xmlnode, treenode.name) - current = xmlnode - if treenode.name.startswith("head"): - current = etree.Element(f"h{treenode.name[4]}") - if treenode.marker: - current.set("id", f"section{treenode.marker}") - xmlnode.append(current) - ignore_formatting = ignore_formatting | {"bold", "italic", "underline"} - - elif treenode.name in {"para"}: - current = etree.Element("p") - xmlnode.append(current) - - elif treenode.name in {"note"}: - current = etree.Element("div") - current.set("class", "nt") - xmlnode.append(current) - - elif treenode.name == "text": - _format_html_text(xmlnode, treenode, ignore_formatting, with_newlines, with_start) - - elif treenode.name == "page": - if not current.get("id"): - current.set("id", f"page{treenode.number}") - print(f"{treenode.number}.", end="", flush=True) - return - - elif treenode.name == "table": - _format_html_table(xmlnode, treenode) - return - - elif treenode.name == "figure": - _format_html_figure(xmlnode, treenode) - return - - elif treenode.name == "bits": - _format_html_bits(xmlnode, treenode) - return - - elif treenode.name.startswith("list"): - if treenode.name[4] in {"b", "s"}: - current = etree.Element("ul") - else: - current = etree.Element("ol") - xmlnode.append(current) - - elif treenode.name == "element": - current = etree.Element("li") - if xmlnode.tag == "ol": - current.set("value", str(treenode.value)) - xmlnode.append(current) - with_start = False - - for child in treenode.children: - _format_html(current, child, ignore_formatting, with_newlines, with_start) - - -def format_document(document): - html = etree.Element("html") - - head = etree.Element("head") - html.append(head) - - link = etree.Element("link") - link.set("rel", "stylesheet") - link.set("href", "../style.css") - head.append(link) - - body = etree.Element("body") - html.append(body) - - _format_html(body, document, with_newlines=True) - - html = etree.ElementTree(html) - return html - - -def write_html(html, path, pretty=True): - with open(path, "wb") as f: - html.write(f, pretty_print=pretty, doctype="") diff --git a/src/modm_data/pdf2html/cell.py b/src/modm_data/pdf2html/cell.py new file mode 100644 index 0000000..2c051eb --- /dev/null +++ b/src/modm_data/pdf2html/cell.py @@ -0,0 +1,125 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +from functools import cached_property +from anytree import Node +from ..utils import Rectangle +from .line import CharLine + + +class TableCell: + class Borders: + """The four borders of a Cell""" + def __init__(self, l, b, r, t): + self.l = l + self.b = b + self.r = r + self.t = t + + def __init__(self, table, position, bbox, borders, is_simple=False): + self._table = table + self._bboxes = [bbox] + self.b = borders + """Borders of the cell""" + self.positions = [position] + """Index positions of the cell""" + self.is_header = False + """Is this cell a header?""" + 
self._is_simple = is_simple + + def _merge(self, other): + self.positions.extend(other.positions) + self.positions.sort() + self._bboxes.append(other.bbox) + self._invalidate() + + def _move(self, x, y): + self.positions = [(py + y, px + x) for (py, px) in self.positions] + self.positions.sort() + self._invalidate() + + def _expand(self, dx, dy): + ymax, xmax = self.positions[-1] + for yi in range(ymax, ymax + dy + 1): + for xi in range(xmax, xmax + dx + 1): + self.positions.append((yi, xi)) + self.positions.sort() + self._invalidate() + + def _invalidate(self): + for key, value in self.__class__.__dict__.items(): + if isinstance(value, cached_property): + self.__dict__.pop(key, None) + + @cached_property + def x(self) -> int: + """The horizontal position of the cell.""" + return self.positions[0][1] + + @cached_property + def y(self) -> int: + """The vertical position of the cell.""" + return self.positions[0][0] + + @cached_property + def xspan(self) -> int: + """The horizontal span of the cell.""" + return self.positions[-1][1] - self.positions[0][1] + 1 + + @cached_property + def yspan(self) -> int: + """The vertical span of the cell.""" + return self.positions[-1][0] - self.positions[0][0] + 1 + + @cached_property + def rotation(self) -> int: + """The rotation of the cell text.""" + if not self.lines: return 0 + return self.lines[0].rotation + + @cached_property + def bbox(self) -> Rectangle: + """The tight bounding box of this cell.""" + return Rectangle(min(bbox.left for bbox in self._bboxes), + min(bbox.bottom for bbox in self._bboxes), + max(bbox.right for bbox in self._bboxes), + max(bbox.top for bbox in self._bboxes)) + + @cached_property + def lines(self) -> list[CharLine]: + """The character lines in this cell.""" + return self._table._page.charlines_in_area(self.bbox) + + @cached_property + def content(self): + """The concatenated text content of the table cell.""" + return "".join(c.char for line in self.lines for c in line.chars) + + @cached_property + def is_left_aligned(self) -> bool: + """Is the text in the cell left aligned?""" + x_em = self._table._page._spacing["x_em"] + for line in self.lines: + if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right): + return True + return False + + @cached_property + def ast(self) -> Node: + """The abstract syntax tree of the cell without graphics.""" + ast = self._table._page.ast_in_area(self.bbox, with_graphics=False, + ignore_xpos=not self.is_left_aligned, + with_bits=False, with_notes=False) + ast.name = "cell" + return ast + + def __repr__(self) -> str: + positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions) + borders = "" + if self.b.l: borders += "[" + if self.b.b: borders += "_" + if self.b.t: borders += "^" + if self.b.r: borders += "]" + start = "CellH" if self.is_header else "Cell" + return start + f"[{positions}] {borders}" + diff --git a/src/modm_data/pdf2html/stmicro/convert.py b/src/modm_data/pdf2html/convert.py similarity index 81% rename from src/modm_data/pdf2html/stmicro/convert.py rename to src/modm_data/pdf2html/convert.py index 1f5ed3b..62504f7 100644 --- a/src/modm_data/pdf2html/stmicro/convert.py +++ b/src/modm_data/pdf2html/convert.py @@ -3,10 +3,11 @@ from anytree import RenderTree -from .ast import merge_area, normalize_document -from .ast import format_document, write_html -from ..render import render_page_pdf -from ...utils import pkg_apply_patch, pkg_file_exists +from .html import format_document, write_html +from .render import render_page_pdf +from ..utils import 
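TableCell._invalidate works because functools.cached_property stores its result in the instance __dict__ under the property's name; popping that entry forces a recomputation the next time the attribute is read, which is needed after a cell was merged, moved, or expanded. A standalone demonstration:

    from functools import cached_property

    class Cell:
        def __init__(self, positions):
            self.positions = positions

        @cached_property
        def span(self) -> int:
            return self.positions[-1] - self.positions[0] + 1

        def invalidate(self):
            # drop every cached_property value stored on this instance
            for name, value in self.__class__.__dict__.items():
                if isinstance(value, cached_property):
                    self.__dict__.pop(name, None)

    c = Cell([0, 1])
    print(c.span)            # 2, computed and cached
    c.positions.append(4)
    print(c.span)            # still 2: the stale cached value
    c.invalidate()
    print(c.span)            # 5, recomputed after invalidation
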
pkg_apply_patch, pkg_file_exists +from .ast import merge_area +from pathlib import Path import pypdfium2 as pp import subprocess @@ -19,7 +20,7 @@ def convert(doc, page_range, output_path, format_chapters=False, pretty=True, debug_doc = None debug_index = 0 for page in doc.pages(page_range): - if not render_all and any(c in page.top for c in {"Contents", "List of ", "Index"}): + if not render_all and not page.is_relevant: continue print(f"\n\n=== {page.top} #{page.number} ===\n") @@ -50,7 +51,7 @@ def convert(doc, page_range, output_path, format_chapters=False, pretty=True, print("No pages parsed, empty document!") return True - document = normalize_document(document) + document = doc._normalize(document) if show_tree: print(RenderTree(document)) @@ -72,15 +73,14 @@ def convert(doc, page_range, output_path, format_chapters=False, pretty=True, return True -def patch(doc, output_path, patch_file=None) -> bool: +def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool: if patch_file is None: - from . import data # First try the patch file for the specific version patch_file = f"{doc.name}.patch" - if not pkg_file_exists(data, patch_file): + if not pkg_file_exists(data_module, patch_file): # Then try the patch file shared between versions patch_file = f"{doc.name.split('-')[0]}.patch" - if not pkg_file_exists(data, patch_file): + if not pkg_file_exists(data_module, patch_file): return True - return pkg_apply_patch(data, patch_file, output_path) + return pkg_apply_patch(data_module, patch_file, output_path) return apply_patch(patch_file, output_path) diff --git a/src/modm_data/pdf2html/html.py b/src/modm_data/pdf2html/html.py new file mode 100644 index 0000000..8db89a8 --- /dev/null +++ b/src/modm_data/pdf2html/html.py @@ -0,0 +1,279 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +import logging +from lxml import etree +import anytree +from anytree import RenderTree +from ..utils import list_strip +from .ast import normalize_lines, normalize_lists, normalize_paragraphs + +_LOGGER = logging.getLogger(__name__) + +def _format_html_figure(xmlnode, figurenode): + tnode = etree.Element("table") + tnode.set("width", f"{int(figurenode._width * 50)}%") + xmlnode.append(tnode) + + captionnode = next((c for c in figurenode.children if c.name == "caption"), None) + if captionnode is not None: + tnode.set("id", f"figure{captionnode.number}") + caption = etree.Element("caption") + tnode.append(caption) + _format_html(caption, captionnode, with_newlines=True) + + ynode = etree.Element("tr") + tnode.append(ynode) + + xynode = etree.Element("td") + ynode.append(xynode) + xynode.text = "(omitted)" + + +def _format_html_table(xmlnode, tablenode): + tnode = etree.Element("table") + xmlnode.append(tnode) + # Format the caption + captionnode = next((c for c in tablenode.children if c.name == "caption"), None) + if captionnode is not None: + tnode.set("id", f"table{captionnode.number}") + caption = etree.Element("caption") + tnode.append(caption) + _format_html(caption, captionnode, with_newlines=True) + if tablenode.obj._type == "register": + tnode.set("class", "rt") + if tablenode.obj._type == "bitfield": + tnode.set("class", "bt") + + # Cells are ordered (y, x) positions + ypos = -1 + ynode = None + header_rows = tablenode.obj.header_rows + for cell in tablenode.obj.cells: + # Add another row to the table + if ypos != cell.y or ynode is None: + ypos = cell.y + ynode = etree.Element("tr") + tnode.append(ynode) + + # Add the right cell with spans and style + 
xynodespan = xynode = etree.Element("th" if cell.is_header else "td") + ynode.append(xynode) + if cell.xspan > 1: + xynode.set("colspan", str(cell.xspan)) + if cell.yspan > 1: + xynode.set("rowspan", str(cell.yspan)) + if not cell.rotation and tablenode.obj._type != "register" and cell.is_left_aligned: + xynode.set("class", "tl") + if cell.rotation: + xynodespan = etree.Element("span") + xynodespan.set("class", "tv") + xynode.append(xynodespan) + if (cell.y + cell.yspan) == header_rows: + if cl := xynode.get("class"): + xynode.set("class", "thb " + cl) + else: + xynode.set("class", "thb") + + if cell._is_simple: + xynodespan.text = cell.content.strip() + else: + cell_doc = anytree.Node("document", _page=cell.ast.page) + cell.ast.parent = cell_doc + cell_doc = normalize_lines(cell_doc) + cell_doc = normalize_lists(cell_doc) + cell_doc = normalize_paragraphs(cell_doc) + # _LOGGER.debug(RenderTree(cell_doc)) + _format_html(xynodespan, cell_doc, with_newlines=True, + ignore_formatting={"bold"} if cell.is_header else None) + + +def _format_char(node, state, chars, ignore): + NOFMT = { + "superscript": False, + "subscript": False, + "italic": False, + "bold": False, + "underline": False, + } + if state is None: state = NOFMT + char = chars[0] + if char["char"] in {'\r'}: + return (True, node, state) + + # print(node, state, char["char"]) + diffs = {} + for key in NOFMT: + if state[key] != char[key] and key not in ignore: + diffs[key] = char[key] + # if diffs: print(diffs) + if not diffs: + prev_name = node.children[-1].name if node.children else None + # print(node) + if prev_name != "newline" and char["char"] == '\n': + # if not (prev_name == "chars" and node.children[-1].chars[-1] == " "): + anytree.Node("newline", parent=node) + elif prev_name != "chars": + anytree.Node("chars", parent=node, chars=char["char"]) + else: + node.children[-1].chars += char["char"] + return (True, node, state) + else: + disable = [key for key, value in diffs.items() if not value] + if disable: + state[node.name] = False + return (False, node.parent, state) + else: + enable = [key for key, value in diffs.items() if value][0] + fmtnode = anytree.Node(enable, parent=node) + state[enable] = True + return (False, fmtnode, state) + + +def _format_lines(textnode, ignore, with_newlines, with_start): + char_props = textnode.root._page._char_properties + formatn = anytree.Node("format") + chars = [] + for line in textnode.children: + if line.name == "line": + for char in line.obj.chars[0 if with_start else line.start:]: + if not with_newlines and char.unicode in {0xa, 0xd}: + continue + chars.append(char_props(line.obj, char)) + if with_newlines and chars[-1]["char"] not in {'\n'}: + char = char_props(line.obj, line.obj.chars[-1]) + char["char"] = '\n' + chars.append(char) + + chars = list_strip(chars, lambda c: c["char"] in {' ', '\n'}) + state = None + node = formatn + while chars: + popchar, node, state = _format_char(node, state, chars, ignore) + if popchar: chars.pop(0) + return formatn + + +def _format_html_fmt(xmlnode, treenode, tail=False): + CONV = { + "superscript": "sup", + "subscript": "sub", + "italic": "i", + "bold": "b", + "underline": "u", + "newline": "br", + } + # print(xmlnode, treenode) + if treenode.name == "chars": + # print(f"{'tail' if tail else 'text'} char={treenode.chars}") + if tail: + xmlnode.tail = (xmlnode.tail or "") + treenode.chars + else: + xmlnode.text = (xmlnode.text or "") + treenode.chars + return (tail, xmlnode) + else: + # print(f"sub {treenode.name}") + if tail: xmlnode = 
xmlnode.getparent() + subnode = etree.SubElement(xmlnode, CONV[treenode.name]) + tail = False + iternode = subnode + for child in treenode.children: + tail, iternode = _format_html_fmt(iternode, child, tail) + return (True, subnode) + + +def _format_html_text(xmlnode, treenode, ignore=None, with_newlines=False, with_start=True): + fmttree = _format_lines(treenode, ignore or set(), with_newlines, with_start) + tail = False + fmtnode = xmlnode + for child in fmttree.children: + tail, fmtnode = _format_html_fmt(fmtnode, child, tail) + + # print(RenderTree(fmttree)) + # print(etree.tostring(xmlnode, pretty_print=True).decode("utf-8")) + + +def _format_html(xmlnode, treenode, ignore_formatting=None, + with_newlines=False, with_start=True): + if ignore_formatting is None: + ignore_formatting = set() + # print(xmlnode, treenode.name) + current = xmlnode + if treenode.name.startswith("head"): + current = etree.Element(f"h{treenode.name[4]}") + if treenode.marker: + current.set("id", f"section{treenode.marker}") + xmlnode.append(current) + ignore_formatting = ignore_formatting | {"bold", "italic", "underline"} + + elif treenode.name in {"para"}: + current = etree.Element("p") + xmlnode.append(current) + + elif treenode.name in {"note"}: + current = etree.Element("div") + current.set("class", "nt") + xmlnode.append(current) + + elif treenode.name == "text": + _format_html_text(xmlnode, treenode, ignore_formatting, with_newlines, with_start) + + elif treenode.name == "page": + if not current.get("id"): + current.set("id", f"page{treenode.number}") + print(f"{treenode.number}.", end="", flush=True) + return + + elif treenode.name == "table": + _format_html_table(xmlnode, treenode) + return + + elif treenode.name == "figure": + _format_html_figure(xmlnode, treenode) + return + + elif treenode.name == "bits": + _format_html_bits(xmlnode, treenode) + return + + elif treenode.name.startswith("list"): + if treenode.name[4] in {"b", "s"}: + current = etree.Element("ul") + else: + current = etree.Element("ol") + xmlnode.append(current) + + elif treenode.name == "element": + current = etree.Element("li") + if xmlnode.tag == "ol": + current.set("value", str(treenode.value)) + xmlnode.append(current) + with_start = False + + for child in treenode.children: + _format_html(current, child, ignore_formatting, with_newlines, with_start) + + +def format_document(document): + html = etree.Element("html") + + head = etree.Element("head") + html.append(head) + + link = etree.Element("link") + link.set("rel", "stylesheet") + link.set("href", "../style.css") + head.append(link) + + body = etree.Element("body") + html.append(body) + + _format_html(body, document, with_newlines=True) + + html = etree.ElementTree(html) + return html + + +def write_html(html, path, pretty=True): + with open(path, "wb") as f: + html.write(f, pretty_print=pretty, doctype="") diff --git a/src/modm_data/pdf2html/line.py b/src/modm_data/pdf2html/line.py index 5b0eb88..31d6e0e 100644 --- a/src/modm_data/pdf2html/line.py +++ b/src/modm_data/pdf2html/line.py @@ -3,6 +3,7 @@ from functools import cached_property from ..utils import Rectangle +from ..pdf import Character class CharCluster: @@ -12,7 +13,7 @@ class CharCluster: character stream of the PDF page. 
""" - def __init__(self, line, chars: list): + def __init__(self, line: "CharLine", chars: list[Character]): self._line = line self.chars = chars @@ -49,16 +50,19 @@ def __init__(self, page, chars: list, bottom: float, @cached_property def bbox(self) -> Rectangle: + """Bounding box of the character line""" return Rectangle(min(c.bbox.left for c in self.chars), min(c.bbox.bottom for c in self.chars), max(c.bbox.right for c in self.chars), max(c.bbox.top for c in self.chars)) @cached_property - def fonts(self) -> set: + def fonts(self) -> set[str]: + """All font names in this character line""" return set(c.font for c in self.chars if c.font) - def contains_font(self, *fragments) -> bool: + def contains_font(self, *fragments: str) -> bool: + """:return: True if any fragment is part of the font names""" for fragment in fragments: if any(fragment in font for font in self.fonts): return True @@ -66,22 +70,23 @@ def contains_font(self, *fragments) -> bool: @cached_property def content(self) -> str: + """Text contained in the character line""" return "".join(c.char for c in self.chars) - def clusters(self, atol: float = None) -> list[CharCluster]: - # Find clusters of characters in a line incl. whitespace chars + def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]: + """Find clusters of characters in a line separated by `absolute_tolerance`.""" def _cluster(clusters, chars): if chars: clusters.append(CharCluster(self, chars)) # We want to group the chars if the space between them is > 1em - if atol is None: - atol = self._page._spacing["x_em"] * 1 + if absolute_tolerance is None: + absolute_tolerance = self._page._spacing["x_em"] * 1 clusters = [] current_chars = [self.chars[0]] last_char = current_chars[0] for next_char in self.chars[1:]: - if next_char.bbox.left - last_char.bbox.right < atol: + if next_char.bbox.left - last_char.bbox.right < absolute_tolerance: # Keep this char in the current cluster current_chars.append(next_char) if next_char.unicode not in {0x20, 0xa, 0xd}: diff --git a/src/modm_data/pdf2html/page.py b/src/modm_data/pdf2html/page.py new file mode 100644 index 0000000..33f687a --- /dev/null +++ b/src/modm_data/pdf2html/page.py @@ -0,0 +1,380 @@ +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 + +import re +import math +import logging +import textwrap +import statistics +from typing import Callable +from functools import cached_property, cache, reduce +from collections import defaultdict +from .table import Table +from .figure import Figure +from .line import CharLine +from ..utils import HLine, VLine, Rectangle, Region +from ..pdf import Path, Image, Page as PdfPage, Character +from anytree import Node + + +_LOGGER = logging.getLogger(__name__) + + +class Page(PdfPage): + def __init__(self, document, index: int): + super().__init__(document, index) + self._template = "default" + self.is_relevant: bool = True + """Is this page relevant for the conversion?""" + + def _unicode_filter(self, code: int) -> int: + return code + + @cached_property + def _spacing(self) -> dict[str, float]: + content = 0.1 + return { + # Horizontal spacing: left->right + "x_em": 0.01 * self.width, + "x_left": content * self.width, + "x_right": (1 - content) * self.width, + "x_content": 0.2 * self.width, + # Vertical spacing: bottom->top + "y_em": 0.01 * self.height, + # Max table line thickness + "y_tline": 0.005 * self.height, + # Max line height distance to detect paragraphs + "lh": 0.9, + # Max line height distance to detect super-/subscript + "sc": 0.3, + # 
Table header cell bold text threshold + "th": 0.3, + } + + def _line_size(self, line: CharLine) -> str: + rsize = line.height + if rsize >= 17.5: return "h1" + elif rsize >= 15.5: return "h2" + elif rsize >= 13.5: return "h3" + elif rsize >= 11.4: return "h4" + elif rsize >= 8.5: return "n" + else: return "fn" + + def _colors(self, color: int) -> str: + if 0xff <= color <= 0xff: return "black" + if 0xffffffff <= color <= 0xffffffff: return "white" + return "unknown" + + @cached_property + def _areas(self) -> dict[str, list[Rectangle] | Rectangle]: + content = Rectangle(0.1, 0.1, 0.9, 0.9) + areas = {"content": [content]} + scaled_areas = {} + def _s(r): + return Rectangle(r.left * self.width, r.bottom * self.height, + r.right * self.width, r.top * self.height) + for name, area in areas.items(): + scaled_areas[name] = [_s(r) for r in area] if isinstance(area, list) else _s(area) + return scaled_areas + + def _char_properties(self, line, char): + cp = { + "superscript": False, + "subscript": False, + "bold": any(frag in char.font for frag in {"Bold"}), + "italic": any(frag in char.font for frag in {"Italic", "Oblique"}), + "underline": (char.objlink or char.weblink) is not None, + "size": round(line.height), + "relsize": self._line_size(line), + "char": chr(char.unicode), + } + if line.rotation: + if char.origin.x < (line.origin - 0.25 * line.height): + cp["superscript"] = True + elif char.origin.x > (line.origin + 0.15 * line.height): + cp["subscript"] = True + elif char.origin.y > (line.origin + 0.25 * line.height): + cp["superscript"] = True + elif char.origin.y < (line.origin - 0.15 * line.height): + cp["subscript"] = True + return cp + + def text_in_named_area(self, name: str, check_length: bool = True) -> str | None: + """ + Find all text in the named area. + + :param name: the name of the area(s) to query. + :param check_length: assert that the text has a length. + :return: the concatenated text of the named area(s) or `None` if area not found. + """ + if name not in self._areas: return None + text = "" + areas = self._areas[name] + if not isinstance(areas, list): areas = [areas] + for area in areas: text += self.text_in_area(area) + if check_length: assert text + return text + + def charlines_in_area(self, area: Rectangle, + predicate: Callable[[Character], bool] = None, + rtol: float = None) -> list[CharLine]: + """ + Coalesce the characters in the area and predicate into lines. + + 1. Every character in the area is filtered by the `predicate`. + 2. Character orientation is split into horizontal (left->right) and + vertical (bottom->top) character lines sorted by x or y position. + Lines containing only whitespace are discarded. + 3. Overlapping character lines are merged into sub- and superscript + using `rtol * max(current_line.height, next_line.height)` as the + tolerance for checking if the lines overlap. + 4. The characters in the merged lines are re-sorted by origin. + + :param area: Area to search for characters. + :param predicate: Function to discard characters in the area or include all by default. + :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default. + :return: A list of character lines sorted by x or y position. 
+ """ + if rtol is None: rtol = self._spacing["sc"] + # Split all chars into lines based on rounded origin + origin_lines_y = defaultdict(list) + origin_lines_x = defaultdict(list) + for char in self.chars_in_area(area): + # Ignore all characters we don't want + if predicate is not None and not predicate(char): + continue + cunicode = self._unicode_filter(char.unicode) + if cunicode is None: continue + char.unicode = cunicode + if char.unicode < 32 and char.unicode not in {0xa}: + continue + # Ignore characters without width that are not spaces + if not char.width and char.unicode not in {0xa, 0xd, 0x20}: + _LOGGER.error(f"Unknown char width for {char}: {char.bbox}") + # Split up the chars depending on the orientation + if 45 < char.rotation <= 135 or 225 < char.rotation <= 315: + origin_lines_x[round(char.origin.x, 1)].append(char) + elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation: + origin_lines_y[round(char.origin.y, 1)].append(char) + else: + _LOGGER.error("Unknown char rotation:", char, char.rotation) + + # Convert characters into lines + bbox_lines_y = [] + for chars in origin_lines_y.values(): + # Remove lines with whitespace only + if all(c.unicode in {0xa, 0xd, 0x20} for c in chars): + continue + origin = statistics.fmean(c.origin.y for c in chars) + line = CharLine(self, chars, + min(c.bbox.bottom for c in chars), + origin, + max(c.bbox.top for c in chars), + max(c.height for c in chars), + sort_origin=self.height - origin) + bbox_lines_y.append(line) + # print(line, line.top, line.origin, line.bottom, line.height) + bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin) + + bbox_lines_x = [] + for chars in origin_lines_x.values(): + # Remove lines with whitespace only + if all(c.unicode in {0xa, 0xd, 0x20} for c in chars): + continue + line = CharLine(self, chars, + min(c.bbox.left for c in chars), + statistics.fmean(c.origin.x for c in chars), + max(c.bbox.right for c in chars), + max(c.width for c in chars), + 270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90) + bbox_lines_x.append(line) + bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin) + + if not bbox_lines: + return [] + + # Merge lines that have overlapping bbox_lines + # FIXME: This merges lines that "collide" vertically like in formulas + merged_lines = [] + current_line = bbox_lines[0] + for next_line in bbox_lines[1:]: + height = max(current_line.height, next_line.height) + # Calculate overlap via normalize origin (increasing with line index) + if ((current_line._sort_origin + rtol * height) > + (next_line._sort_origin - rtol * height)): + # if line.rotation or self.rotation: + # # The next line overlaps this one, we merge the shorter line + # # (typically super- and subscript) into taller line + # use_current = len(current_line.chars) >= len(next_line.chars) + # else: + use_current = current_line.height >= next_line.height + line = current_line if use_current else next_line + current_line = CharLine(self, current_line.chars + next_line.chars, + line.bottom, line.origin, line.top, + height, line.rotation, + sort_origin=line._sort_origin) + else: + # The next line does not overlap the current line + merged_lines.append(current_line) + current_line = next_line + # append last line + merged_lines.append(current_line) + + # Sort all lines horizontally based on character origin + sorted_lines = [] + for line in merged_lines: + if line.rotation == 90: + def sort_key(char): + if char.unicode in {0xa, 0xd}: + return char.tbbox.midpoint.y - 1e9 + return 
char.tbbox.midpoint.y + elif line.rotation == 270: + def sort_key(char): + if char.unicode in {0xa, 0xd}: + return -char.tbbox.midpoint.y + 1e9 + return -char.tbbox.midpoint.y + else: + def sort_key(char): + if char.unicode in {0xa, 0xd}: + return char.origin.x + 1e9 + return char.origin.x + sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key), + line.bottom, line.origin, + line.top, line.height, + line.rotation, area.left, + sort_origin=line._sort_origin)) + + return sorted_lines + + def graphic_bboxes_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[tuple[Rectangle, Table | Figure | None]]: + """ + Coalesce the graphics in the area into full width bounding boxes. + + 1. Group vertically overlapping graphics. + 2. Widen the overlapped graphics bounding boxes to the edges of the area. + + :param area: area to search for content. + :param with_graphics: search for graphics in the area. + :return: list of tuples (bounding box, graphic objects or `None`). + """ + if with_graphics: + graphics = self.graphics_in_area(area) + regions = [] + # Check if graphics bounding boxes overlap vertically and group them + for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)): + gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox + for reg in regions: + if reg.overlaps(gbbox.bottom, gbbox.top): + # They overlap, so merge them + reg.v0 = min(reg.v0, gbbox.bottom) + reg.v1 = max(reg.v1, gbbox.top) + reg.objs.append(graphic) + break + else: + regions.append(Region(gbbox.bottom, gbbox.top, graphic)) + + # print(regions) + # Coalesce all overlapped graphics objects into full width areas + areas = [] + ypos = area.top + for reg in regions: + if ypos - reg.v1 > self._spacing["y_em"]: + areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None)) + for obj in reg.objs: + oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox + areas.append((oarea, obj)) + ypos = reg.v0 + areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None)) + else: + areas = [(area, None)] + return areas + + def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]: + """ + Find all content objects in this area. + + :param area: area to search for content. + :param with_graphics: search for graphics in the area. + :return: list of content objects sorted top to bottom. + """ + self._link_characters() + areas = self.graphic_bboxes_in_area(area, with_graphics) + objects = [] + for narea, obj in areas: + if obj is None: + objects += self.charlines_in_area(narea) + else: + oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox + predicate = lambda c: not obj.bbox.contains(c.origin) + lines = self.charlines_in_area(oarea, predicate) + # print(obj, oarea, lines, [line.content for line in lines]) + objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x))) + return objects + + def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]: + """ + Find all tables and figures in this area. + + :param area: area to search for graphics. + :return: list of tables and figures. + """ + return [] + + def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node: + """ + Convert the area content into an abstract syntax tree. + + :param area: area to search for content. + :param with_graphics: including graphics in the area. + :return: An abstract syntax tree including the content formatting. 
+ """ + return Node("area", obj=area, xpos=int(area.left), page=self) + + @property + def content_ast(self) -> list[Node]: + """The abstract syntax trees in the content area.""" + ast = [] + with_graphics = True + for area in self._areas["content"]: + ast.append(self.ast_in_area(area, with_graphics=with_graphics)) + # Add a page node to the first leaf to keep track of where a page starts + first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0]) + Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number) + return ast + + @property + def content_objects(self) -> list[CharLine | Table | Figure]: + """All objects in the content areas.""" + objs = [] + for area in self._areas["content"]: + objs.extend(self.objects_in_area(area)) + return objs + + @property + def content_graphics(self) -> list[Table | Figure]: + """All graphics in the content areas.""" + objs = [] + for area in self._areas["content"]: + objs.extend(self.graphics_in_area(area)) + return objs + + @property + def content_lines(self) -> list[CharLine]: + """All lines in the content areas.""" + objs = [] + for area in self._areas["content"]: + objs.extend(self.charlines_in_area(area)) + return objs + + @property + def content_tables(self) -> list[Table]: + """All tables in the content areas.""" + return [o for o in self.content_graphics if isinstance(o, Table)] + + @property + def content_figures(self) -> list[Figure]: + """All figures in the content areas.""" + return [o for o in self.content_graphics if isinstance(o, Figure)] + + def __repr__(self) -> str: + return f"Page({self.number})" diff --git a/src/modm_data/pdf2html/render.py b/src/modm_data/pdf2html/render.py index 0717bd4..526eb61 100644 --- a/src/modm_data/pdf2html/render.py +++ b/src/modm_data/pdf2html/render.py @@ -11,7 +11,7 @@ def render_page_pdf(doc, page, new_doc = None, index = 0): """ - Test doc string + :param doc: PDF document :param page: PDF page diff --git a/src/modm_data/pdf2html/stmicro/__init__.py b/src/modm_data/pdf2html/stmicro/__init__.py index 4adcde6..fd9ce58 100644 --- a/src/modm_data/pdf2html/stmicro/__init__.py +++ b/src/modm_data/pdf2html/stmicro/__init__.py @@ -1,7 +1,5 @@ # Copyright 2022, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -from .page import Page, is_compatible -from .ast import normalize_document, merge_area, format_document, write_html -from .convert import convert, patch + from .document import Document diff --git a/src/modm_data/pdf2html/stmicro/__main__.py b/src/modm_data/pdf2html/stmicro/__main__.py index 40d2ef0..208e2f6 100644 --- a/src/modm_data/pdf2html/stmicro/__main__.py +++ b/src/modm_data/pdf2html/stmicro/__main__.py @@ -3,15 +3,16 @@ import re import tqdm +import logging import argparse import subprocess from pathlib import Path from multiprocessing.pool import ThreadPool -import modm_data -from . import convert, patch +from .. 
import convert, patch def main(): + import modm_data parser = argparse.ArgumentParser() parser.add_argument("--document", type=Path) parser.add_argument("--output", type=str, default="") @@ -25,12 +26,14 @@ def main(): parser.add_argument("--chapters", action="store_true") parser.add_argument("--tags", action="store_true") parser.add_argument("--all", action="store_true") + parser.add_argument("-v", dest="verbose", action="count", default=0) args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) doc = modm_data.pdf2html.stmicro.Document(args.document) - # if doc.page_count == 0 or not doc.page(1).width: - # print("Corrupt PDF!") - # exit(1) + if doc.page_count == 0 or not doc.page(1).width: + print("Corrupt PDF!") + exit(1) if args.page or args.range: page_range = list(map(lambda p: p - 1, args.page or [])) @@ -79,7 +82,8 @@ def main(): for retval, call in zip(retvals, calls): if retval.returncode != 0: print(call) if all(r.returncode == 0 for r in retvals): - return patch(doc, output_dir) + from . import data + return patch(doc, data, output_dir) return False return convert(doc, page_range, output_path, format_chapters=args.chapters, diff --git a/src/modm_data/pdf2html/stmicro/document.py b/src/modm_data/pdf2html/stmicro/document.py index fdecf9b..3931033 100644 --- a/src/modm_data/pdf2html/stmicro/document.py +++ b/src/modm_data/pdf2html/stmicro/document.py @@ -1,13 +1,44 @@ # Copyright 2023, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 +import logging from .page import Page as StmPage from ...pdf import Document as PdfDocument +from ..ast import normalize_lines, normalize_captions, normalize_lists +from ..ast import normalize_paragraphs, normalize_headings, normalize_registers +from ..ast import normalize_tables, normalize_chapters + +_LOGGER = logging.getLogger(__name__) + +def _debug(func, indata, debug=0): + _LOGGER.debug(func.__name__) + if debug == -1: + _LOGGER.debug(RenderTree(indata)) + _LOGGER.debug() + outdata = func(indata) + if debug == 1: + _LOGGER.debug(RenderTree(outdata)) + _LOGGER.debug() + return outdata + + +def _normalize_document(document): + document = _debug(normalize_lines, document) + document = _debug(normalize_captions, document) + document = _debug(normalize_lists, document) + document = _debug(normalize_paragraphs, document) + document = _debug(normalize_headings, document) + document = _debug(normalize_registers, document) + document = _debug(normalize_tables, document) + # document = _debug(normalize_chapters, document) + return document + class Document(PdfDocument): def __init__(self, path: str): super().__init__(path) + self._normalize = _normalize_document def page(self, index: int) -> StmPage: assert index < self.page_count diff --git a/src/modm_data/pdf2html/stmicro/page.py b/src/modm_data/pdf2html/stmicro/page.py index b56b56e..68cec3c 100644 --- a/src/modm_data/pdf2html/stmicro/page.py +++ b/src/modm_data/pdf2html/stmicro/page.py @@ -8,15 +8,17 @@ import statistics from functools import cached_property, cache, reduce from collections import defaultdict -from .table import Table +from ..table import Table from ..figure import Figure from ..line import CharLine from ...utils import HLine, VLine, Rectangle, Region from ...pdf import Path, Image, Page as PdfPage +from ..page import Page as BasePage from anytree import Node -LOGGER = logging.getLogger(__name__) +_LOGGER = logging.getLogger(__name__) + def is_compatible(document) -> bool: if "stmicro" in document.metadata.get("Author", "").lower(): @@ 
-24,7 +26,7 @@ def is_compatible(document) -> bool: return False -def areas_black_white(page) -> dict: +def _areas_black_white(page) -> dict: def _scale(r): if page.rotation: return Rectangle(r.bottom * page.width, (1 - r.right) * page.height, @@ -94,7 +96,7 @@ def _scale(r): return scaled_areas -def areas_blue_gray(page) -> dict: +def _areas_blue_gray(page) -> dict: def _scale(r): return Rectangle(r.left * page.width, r.bottom * page.height, r.right * page.width, r.top * page.height) @@ -146,7 +148,7 @@ def _scale(r): return scaled_areas -def spacing_black_white(page) -> dict: +def _spacing_black_white(page) -> dict: content = 0.1125 spacing = { # Horizontal spacing: left->right @@ -177,10 +179,10 @@ def spacing_black_white(page) -> dict: "lh": 1.2, "sc": 0.4, }) - return spacing + return spacing | _spacing_special(page) -def spacing_blue_gray(page) -> dict: +def _spacing_blue_gray(page) -> dict: content = 0.07 spacing = { # Horizontal spacing: left->right @@ -210,10 +212,25 @@ def spacing_blue_gray(page) -> dict: "lh": 1.6, "sc": 0.2, }) - return spacing + return spacing | _spacing_special(page) + + +def _spacing_special(page) -> dict: + # Patches to detect the header cells correctly + if ((page.pdf.name == "DS12930-v1" and page.index in range(90, 106)) or + (page.pdf.name == "DS12931-v1" and page.index in range(89, 105))): + return {"th": 0.1} + if ((page.pdf.name == "RM0453-v2" and page.index in [1354]) or + (page.pdf.name == "RM0456-v2" and page.index in [2881]) or + (page.pdf.name == "RM0456-v3" and page.index in [2880]) or + (page.pdf.name == "RM0461-v4" and page.index in [1246])): + return {"th": 0.5} + if ((page.pdf.name == "RM0456-v2" and page.index in [3005])): + return {"th": 0.52} + return {} -def linesize_black_white(line: float) -> str: +def _linesize_black_white(line: CharLine) -> str: rsize = line.height if rsize >= 17.5: return "h1" elif rsize >= 15.5: return "h2" @@ -223,7 +240,7 @@ def linesize_black_white(line: float) -> str: else: return "fn" -def linesize_blue_gray(line: float) -> str: +def _linesize_blue_gray(line: CharLine) -> str: rsize = round(line.height) if rsize >= 16: return "h1" elif rsize >= 14: return "h2" @@ -233,7 +250,7 @@ def linesize_blue_gray(line: float) -> str: else: return "fn" -def colors_black_white(color: int) -> str: +def _colors_black_white(color: int) -> str: if 0xff <= color <= 0xff: return "black" if 0xffffffff <= color <= 0xffffffff: @@ -241,7 +258,7 @@ def colors_black_white(color: int) -> str: return "unknown" -def colors_blue_gray(color: int) -> str: +def _colors_blue_gray(color: int) -> str: if 0xff <= color <= 0xff: return "black" if 0xffffffff <= color <= 0xffffffff: @@ -257,230 +274,53 @@ def colors_blue_gray(color: int) -> str: return "unknown" -class Page(PdfPage): - +class Page(BasePage): def __init__(self, document, index: int): super().__init__(document, index) - self._template = "black_white" producer = self.pdf.metadata.get("Producer", "").lower() - if "acrobat" in producer: - pass # default + self._template = "black_white" + if "acrobat" in producer or "adobe" in producer: + pass elif "antenna" in producer: self._template = "blue_gray" else: - LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'") + _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. 
'{producer}'") if "blue_gray" in self._template: - self._areas = areas_blue_gray(self) - self._spacing = spacing_blue_gray(self) - self._colors = colors_blue_gray - self._line_size = linesize_blue_gray + self._areas = _areas_blue_gray(self) + self._spacing = _spacing_blue_gray(self) + self._colors = _colors_blue_gray + self._line_size = _linesize_blue_gray elif "black_white" in self._template: - self._areas = areas_black_white(self) - self._spacing = spacing_black_white(self) - self._colors = colors_black_white - self._line_size = linesize_black_white - - # Patches to detect the header cells correctly - if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or - (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))): - self._spacing["th"] = 0.1 - if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or - (self.pdf.name == "RM0456-v2" and self.index in [2881]) or - (self.pdf.name == "RM0456-v3" and self.index in [2880]) or - (self.pdf.name == "RM0461-v4" and self.index in [1246])): - self._spacing["th"] = 0.5 - if ((self.pdf.name == "RM0456-v2" and self.index in [3005])): - self._spacing["th"] = 0.52 - - def _text_in_area(self, name, check_length=True) -> str: - if name not in self._areas: return "" - text = "" - areas = self._areas[name] - if not isinstance(areas, list): areas = [areas] - for area in areas: - text += self.text_in_area(area) - if check_length: assert text - return text + self._areas = _areas_black_white(self) + self._spacing = _spacing_black_white(self) + self._colors = _colors_black_white + self._line_size = _linesize_black_white + + def _unicode_filter(self, code: int) -> int: + # Ignore Carriage Return characters and ® (superscript issues) + if code in {0xd, ord("®")}: return None + # Correct some weird unicode stuffing choices + if code in {2}: return ord("-") + if code in {61623, 61664}: return ord("•") + return code @cached_property def identifier(self) -> str: - return self._text_in_area("id", check_length=False) + return self.text_in_named_area("id", check_length=False) @cached_property def top(self) -> str: if self.index == 0: return "Cover" - return self._text_in_area("top", check_length=False) + return self.text_in_named_area("top", check_length=False) + @cached_property def is_relevant(self) -> bool: if any(c in self.top for c in {"Contents", "List of ", "Index"}): return False return True - def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]: - if rtol is None: rtol = self._spacing["sc"] - # Split all chars into lines based on rounded origin - origin_lines_y = defaultdict(list) - origin_lines_x = defaultdict(list) - for char in self.chars_in_area(area): - # Ignore all characters we don't want - if predicate is not None and not predicate(char): - continue - # Ignore Carriage Return characters and ® (superscript issues) - if char.unicode in {0xd, ord("®")}: - continue - # Correct some weird unicode stuffing choices - if char.unicode in {2}: - char.unicode = ord("-") - if char.unicode in {61623, 61664}: - char.unicode = ord("•") - if char.unicode < 32 and char.unicode not in {0xa}: - continue - # Ignore characters without width that are not spaces - if not char.width and char.unicode not in {0xa, 0xd, 0x20}: - LOGGER.error(f"Unknown char width for {char}: {char.bbox}") - # Split up the chars depending on the orientation - if 45 < char.rotation <= 135 or 225 < char.rotation <= 315: - origin_lines_x[round(char.origin.x, 1)].append(char) - elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < 
char.rotation: - origin_lines_y[round(char.origin.y, 1)].append(char) - else: - LOGGER.error("Unknown char rotation:", char, char.rotation) - - # Convert characters into lines - bbox_lines_y = [] - for chars in origin_lines_y.values(): - # Remove lines with whitespace only - if all(c.unicode in {0xa, 0xd, 0x20} for c in chars): - continue - origin = statistics.fmean(c.origin.y for c in chars) - line = CharLine(self, chars, - min(c.bbox.bottom for c in chars), - origin, - max(c.bbox.top for c in chars), - max(c.height for c in chars), - sort_origin=self.height - origin) - bbox_lines_y.append(line) - # print(line, line.top, line.origin, line.bottom, line.height) - bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin) - - bbox_lines_x = [] - for chars in origin_lines_x.values(): - # Remove lines with whitespace only - if all(c.unicode in {0xa, 0xd, 0x20} for c in chars): - continue - line = CharLine(self, chars, - min(c.bbox.left for c in chars), - statistics.fmean(c.origin.x for c in chars), - max(c.bbox.right for c in chars), - max(c.width for c in chars), - 270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90) - bbox_lines_x.append(line) - bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin) - - if not bbox_lines: - return [] - - # Merge lines that have overlapping bbox_lines - # FIXME: This merges lines that "collide" vertically like in formulas - merged_lines = [] - current_line = bbox_lines[0] - for next_line in bbox_lines[1:]: - height = max(current_line.height, next_line.height) - # Calculate overlap via normalize origin (increasing with line index) - if ((current_line._sort_origin + rtol * height) > - (next_line._sort_origin - rtol * height)): - # if line.rotation or self.rotation: - # # The next line overlaps this one, we merge the shorter line - # # (typically super- and subscript) into taller line - # use_current = len(current_line.chars) >= len(next_line.chars) - # else: - use_current = current_line.height >= next_line.height - line = current_line if use_current else next_line - current_line = CharLine(self, current_line.chars + next_line.chars, - line.bottom, line.origin, line.top, - height, line.rotation, - sort_origin=line._sort_origin) - else: - # The next line does not overlap the current line - merged_lines.append(current_line) - current_line = next_line - # append last line - merged_lines.append(current_line) - - # Sort all lines horizontally based on character origin - sorted_lines = [] - for line in merged_lines: - if line.rotation == 90: - def sort_key(char): - if char.unicode in {0xa, 0xd}: - return char.tbbox.midpoint.y - 1e9 - return char.tbbox.midpoint.y - elif line.rotation == 270: - def sort_key(char): - if char.unicode in {0xa, 0xd}: - return -char.tbbox.midpoint.y + 1e9 - return -char.tbbox.midpoint.y - else: - def sort_key(char): - if char.unicode in {0xa, 0xd}: - return char.origin.x + 1e9 - return char.origin.x - sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key), - line.bottom, line.origin, - line.top, line.height, - line.rotation, area.left, - sort_origin=line._sort_origin)) - - return sorted_lines - - def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list: - if with_graphics: - graphics = self._graphics_filtered(area) - regions = [] - for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)): - gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox - for reg in regions: - if reg.overlaps(gbbox.bottom, gbbox.top): - # They overlap, so merge them - 
reg.v0 = min(reg.v0, gbbox.bottom) - reg.v1 = max(reg.v1, gbbox.top) - reg.objs.append(graphic) - break - else: - regions.append(Region(gbbox.bottom, gbbox.top, graphic)) - - # print(regions) - areas = [] - ypos = area.top - for reg in regions: - if ypos - reg.v1 > self._spacing["y_em"]: - areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None)) - for obj in reg.objs: - oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox - areas.append((oarea, obj)) - ypos = reg.v0 - areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None)) - else: - areas = [(area, None)] - return areas - - def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list: - self._link_characters() - areas = self._content_areas(area, with_graphics) - objects = [] - for narea, obj in areas: - if obj is None: - objects += self._charlines_filtered(narea) - else: - oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox - predicate = lambda c: not obj.bbox.contains(c.origin) - lines = self._charlines_filtered(oarea, predicate) - # print(obj, oarea, lines, [line.content for line in lines]) - objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x))) - return objects - @property def content_ast(self) -> list: ast = [] @@ -492,13 +332,13 @@ def content_ast(self) -> list: re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1) with_graphics = (order_page != self.index) for area in self._areas["content"]: - ast.append(self._ast_filtered(area, with_graphics=with_graphics)) + ast.append(self.ast_in_area(area, with_graphics=with_graphics)) # Add a page node to the first leaf to keep track of where a page starts first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0]) Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number) return ast - def _graphics_filtered(self, area) -> list: + def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]: # Find all graphic clusters in this area em = self._spacing["y_em"] large_area = area.offset_x(em/2) @@ -511,7 +351,7 @@ def _graphics_filtered(self, area) -> list: # Find the captions and group them by y origin to catch side-by-side figures ycaptions = defaultdict(list) - for line in self._charlines_filtered(area, lambda c: "Bold" in c.font): + for line in self.charlines_in_area(area, lambda c: "Bold" in c.font): for cluster in line.clusters(): for phrase in [r"Figure \d+\.", r"Table \d+\."]: if re.match(phrase, cluster.content): @@ -531,7 +371,7 @@ def _graphics_filtered(self, area) -> list: if b.bottom <= bottom and left <= b.left and b.right <= right), None) if graphic is None: - LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}") + _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}") continue if self._template == "blue_gray": @@ -545,7 +385,7 @@ def _graphics_filtered(self, area) -> list: break cbbox = nbbox cchars = nchars - elif self._template == "black_white": + else: cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top) otype = phrase.split(" ")[0].lower() @@ -583,6 +423,7 @@ def _graphics_filtered(self, area) -> list: for gbbox, paths in graphic_clusters: if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]: continue + category = "" if any(isinstance(p, Image) for p in paths): category = "figure" elif self._template == "blue_gray": @@ -643,9 +484,9 @@ def _graphics_filtered(self, area) -> list: elif line.direction == 
line.Direction.HORIZONTAL: ylines.append(line.specialize()) else: - LOGGER.warn(f"Line not vertical or horizontal: {line}") + _LOGGER.warn(f"Line not vertical or horizontal: {line}") else: - LOGGER.warn(f"Path too long: {path}") + _LOGGER.warn(f"Path too long: {path}") elif self._colors(path.fill) == "darkblue": # Add the bottom line of the dark blue header box as a very thick line line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5) @@ -681,58 +522,9 @@ def _graphics_filtered(self, area) -> list: return objects - @property - def content_objects(self) -> list: - objs = [] - for area in self._areas["content"]: - objs.extend(self._objects_filtered(area)) - return objs - - @property - def content_graphics(self) -> list: - objs = [] - for area in self._areas["content"]: - objs.extend(self._graphics_filtered(area)) - return objs - - @property - def content_lines(self) -> list: - return [o for o in self.content_objects if isinstance(o, CharLine)] - - @property - def content_tables(self) -> list: - return [o for o in self.content_graphics if isinstance(o, Table)] - - @property - def content_figures(self) -> list: - return [o for o in self.content_graphics if isinstance(o, Figure)] - - def _char_properties(self, line, char): - cp = { - "superscript": False, - "subscript": False, - "bold": any(frag in char.font for frag in {"Bold"}), - "italic": any(frag in char.font for frag in {"Italic", "Oblique"}), - "underline": (char.objlink or char.weblink) is not None, - "size": round(line.height), - "relsize": self._line_size(line), - "char": chr(char.unicode), - } - - if line.rotation: - if char.origin.x < (line.origin - 0.25 * line.height): - cp["superscript"] = True - elif char.origin.x > (line.origin + 0.15 * line.height): - cp["subscript"] = True - elif char.origin.y > (line.origin + 0.25 * line.height): - cp["superscript"] = True - elif char.origin.y < (line.origin - 0.15 * line.height): - cp["subscript"] = True - - return cp - - def _ast_filtered(self, area: Rectangle, with_graphics=True, - ignore_xpos=False, with_bits=True, with_notes=True) -> list: + def ast_in_area(self, area: Rectangle, with_graphics: bool = True, + ignore_xpos: bool = False, with_bits: bool = True, + with_notes: bool = True) -> Node: x_em = self._spacing["x_em"] spacing_content = self._spacing["x_content"] lh_factor = self._spacing["lh"] @@ -753,8 +545,9 @@ def parent_name(current): current = root ypos = area.top - for obj in self._objects_filtered(area, with_graphics): + for obj in self.objects_in_area(area, with_graphics): xpos = round(obj.bbox.left) + # Tables should remain in their current hierarchy regardless of indentation if isinstance(obj, (Table, Figure)): current = next((c for c in current.iter_path_reverse() @@ -763,6 +556,7 @@ def parent_name(current): Node(name, parent=current, obj=obj, xpos=xpos, number=-1, _width=obj.bbox.width / area.width, _type=obj._type) ypos = obj.bbox.bottom + # Lines of text need to be carefully checked for indentation elif isinstance(obj, CharLine): newlines = round((ypos - obj.origin) / (lh_factor * obj.height)) @@ -783,6 +577,7 @@ def parent_name(current): current = current.parent.parent # print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content) + # Check if line is a heading, which may be multi-line, so we must # be careful not to nest them, but group them properly # Headings are always inserted into the root note! 
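
Editor's note on the `ast_in_area` hunks above: text lines are nested into the abstract syntax tree purely by their horizontal position (`xpos`), while tables and figures stay attached to the current hierarchy level regardless of indentation. The following is a minimal, self-contained sketch of that indentation-to-tree idea using `anytree`; the node names, the `x_em` threshold and the input tuples are hypothetical and deliberately much simpler than the real spacing heuristics in this file.

    # Illustrative sketch only: nest text lines by indentation (xpos) into an
    # anytree hierarchy, keeping "graphic" nodes attached to the current parent
    # regardless of their indentation. Thresholds and node names are hypothetical.
    from anytree import Node, RenderTree

    def build_tree(items, x_em=5):
        root = Node("area", xpos=0)
        current = root
        for kind, xpos, payload in items:
            if kind == "graphic":
                # Tables/figures do not change the indentation level
                Node("table", parent=current, xpos=current.xpos, obj=payload)
                continue
            if xpos > current.xpos + x_em:
                # Indented further than the current level -> nest deeper
                current = Node("list", parent=current, xpos=xpos)
            while xpos < current.xpos - x_em and current.parent is not None:
                # Dedented -> climb back up towards the root
                current = current.parent
            Node("para", parent=current, xpos=xpos, obj=payload)
        return root

    if __name__ == "__main__":
        tree = build_tree([("text", 50, "left-aligned line"),
                           ("text", 60, "indented bullet"),
                           ("graphic", 80, "Table 1."),
                           ("text", 50, "back at the left margin")])
        for pre, _, node in RenderTree(tree):
            print(f"{pre}{node.name} @ {node.xpos}")

Running the sketch prints the nested structure, which is roughly what `print(RenderTree(document))` shows when the `show_tree` debugging option of `convert()` is enabled.
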
@@ -853,15 +648,15 @@ def parent_name(current): else: # Default back to the regex if "Reserved" not in content: - LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}") + _LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}") content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content) if content_start is None: - LOGGER.error(f"Unable to match Bit regex at all! '{content}'!") + _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!") content_start = 0 else: content_start = len(content_start.group(0)) if not content_start: - LOGGER.error(f"Missing content start (=0)! '{content}'!") + _LOGGER.error(f"Missing content start (=0)! '{content}'!") content_start = min(content_start, len(obj.chars) - 1) current = next((c for c in current.iter_path_reverse() @@ -895,4 +690,4 @@ def parent_name(current): return root def __repr__(self) -> str: - return f"StPage({self.number})" + return f"StmPage({self.number})" diff --git a/src/modm_data/pdf2html/stmicro/table.py b/src/modm_data/pdf2html/table.py similarity index 82% rename from src/modm_data/pdf2html/stmicro/table.py rename to src/modm_data/pdf2html/table.py index e0744b5..6aa0995 100644 --- a/src/modm_data/pdf2html/stmicro/table.py +++ b/src/modm_data/pdf2html/table.py @@ -5,112 +5,10 @@ import statistics from functools import cached_property from collections import defaultdict -from ...utils import HLine, VLine, Rectangle - -LOGGER = logging.getLogger(__name__) - - -class TableCell: - class Borders: - def __init__(self, l, b, r, t): - self.l = l - self.b = b - self.r = r - self.t = t - - def __init__(self, table, position, bbox, borders, is_simple=False): - self._table = table - self._bboxes = [bbox] - self.b = borders - self.positions = [position] - self.is_header = False - self._is_simple = is_simple - self._bbox = None - self._lines = None - - def _merge(self, other): - self.positions.extend(other.positions) - self.positions.sort() - self._bboxes.append(other.bbox) - self._bbox = None - self._lines = None - - def _move(self, x, y): - self.positions = [(py + y, px + x) for (py, px) in self.positions] - self.positions.sort() - - def _expand(self, dx, dy): - ymax, xmax = self.positions[-1] - for yi in range(ymax, ymax + dy + 1): - for xi in range(xmax, xmax + dx + 1): - self.positions.append((yi, xi)) - self.positions.sort() +from ..utils import HLine, VLine, Rectangle +from .cell import TableCell - @property - def x(self) -> int: - return self.positions[0][1] - - @property - def y(self) -> int: - return self.positions[0][0] - - @property - def xspan(self) -> int: - return self.positions[-1][1] - self.positions[0][1] + 1 - - @property - def yspan(self) -> int: - return self.positions[-1][0] - self.positions[0][0] + 1 - - @property - def rotation(self) -> int: - if not self.lines: return 0 - return self.lines[0].rotation - - @property - def bbox(self) -> Rectangle: - if self._bbox is None: - self._bbox = Rectangle(min(bbox.left for bbox in self._bboxes), - min(bbox.bottom for bbox in self._bboxes), - max(bbox.right for bbox in self._bboxes), - max(bbox.top for bbox in self._bboxes)) - return self._bbox - - @property - def lines(self): - if self._lines is None: - self._lines = self._table._page._charlines_filtered(self.bbox) - return self._lines - - @property - def content(self): - return "".join(c.char for line in self.lines for c in line.chars) - - @property - def left_aligned(self): - x_em = 
self._table._page._spacing["x_em"] - for line in self.lines: - if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right): - return True - return False - - @property - def ast(self): - ast = self._table._page._ast_filtered(self.bbox, with_graphics=False, - ignore_xpos=not self.left_aligned, - with_bits=False, with_notes=False) - ast.name = "cell" - return ast - - def __repr__(self) -> str: - positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions) - borders = "" - if self.b.l: borders += "[" - if self.b.b: borders += "_" - if self.b.t: borders += "^" - if self.b.r: borders += "]" - start = "CellH" if self.is_header else "Cell" - return start + f"[{positions}] {borders}" +_LOGGER = logging.getLogger(__name__) class Table: @@ -143,26 +41,26 @@ def _cluster(lines, key): # Find the positions of the top numbers clusters = [] - if lines := self._page._charlines_filtered(cbbox): + if lines := self._page.charlines_in_area(cbbox): if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)): clusters.append((cluster, cbbox)) else: self.grid = (0, 0) - LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})") + _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})") # Find the positions of the second row of numbers if len(ygrid) > 2: for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])): nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y, self.bbox.right, ygrid[ypos1][0].p0.y) - if lines := self._page._charlines_filtered(nbbox): + if lines := self._page.charlines_in_area(nbbox): if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars): if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16: clusters.append((cluster, nbbox)) self._bit_headers = len(ygrid) - yi - 1 else: self.grid = (len(cluster), 0) - LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})") + _LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})") break # Merge these clusters to find their positions @@ -235,7 +133,7 @@ def _fix_borders(self, cells, x: int, y: int): r = cells[(x + 1, y)].b if cells[(x + 1, y)] is not None else TableCell.Borders(0, 0, 1, 0) t = cells[(x, y + 1)].b if cells[(x, y + 1)] is not None else TableCell.Borders(0, 1, 0, 0) - # if (not c.t and c.l and c.r and c.b) and "Reset value" in cell.content: + # if (not c.t and csand c.r and c.b) and "Reset value" in cell.content: # c.t = 1 # Open at the top into a span @@ -401,7 +299,7 @@ def append_bottom(self, other, merge_headers=True) -> bool: print(len(merged_xheaders), merged_xheaders) # If they are not equal length the table layouts are not compatible at all! 
if len(self_heads) != len(other_heads): - LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})") + _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})") return False # We want to stuff/move the cell positions inplace, therefore we start @@ -444,6 +342,7 @@ def _insert_cells(cell, src, dsts, insert_only): assert new_positions assert len(new_positions) == len(set(new_positions)) cell.positions = sorted(new_positions) + cell._invalidate() def _move_cells(cells, own_xpos): if debug: @@ -497,7 +396,7 @@ def _move_cells(cells, own_xpos): def append_side(self, other, expand=False) -> bool: if self.grid[1] != other.grid[1]: if expand: - LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})") + _LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})") ymin = min(self.grid[1], other.grid[1]) ymax = max(self.grid[1], other.grid[1]) etable = other if self.grid[1] > other.grid[1] else self @@ -506,7 +405,7 @@ def append_side(self, other, expand=False) -> bool: cell._expand(0, ymax - ymin) etable.grid = (etable.grid[0], ymax) else: - LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})") + _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})") return False # We must move all cells to the right now diff --git a/tools/make/arm.mk b/tools/make/arm.mk index c561bc6..f8afce9 100644 --- a/tools/make/arm.mk +++ b/tools/make/arm.mk @@ -14,6 +14,6 @@ clone-sources-arm: ext/arm/cmsis/ .PHONY: update-sources-arm ## Update all ARM related repositories to the latest version. update-sources-arm: - @(cd ext/arm/cmsis && git pull) & + @(cd ext/arm/cmsis && git fetch && git reset --hard origin/master) & @wait diff --git a/tools/make/common.mk b/tools/make/common.mk index 9f6ffa8..164af89 100644 --- a/tools/make/common.mk +++ b/tools/make/common.mk @@ -1,7 +1,7 @@ # Copyright 2023, Niklas Hauser # SPDX-License-Identifier: MPL-2.0 -### @Utils Utilities \1000 +### @Utils Utilities \1010 log/%: @mkdir -p $@ @@ -34,7 +34,7 @@ venv: $(MAKE) pip-install-frozen .PHONY: clean-venv -# Remove the virtual environment +## Remove the virtual environment clean-venv: @rm -rf .venv @@ -55,13 +55,13 @@ build-homepage: serve-api-docs: @pdoc --mermaid modm_data - +### @Tests Testing \1009 # ================================== Testing ================================== ext/test/regression/: @git clone --depth=1 git@github.com:modm-ext/modm-data-test-docs.git $@ .PHONY: run-regression-tests -## Convert some PDF pages and check against their known HTML. +## @Tests Convert some PDF pages and check against their known HTML. run-regression-tests: ext/test/regression/ @test/convert_html.sh @git diff --exit-code -- test/data/html diff --git a/tools/make/stmicro.mk b/tools/make/stmicro.mk index 8d35f3c..e89eaf6 100644 --- a/tools/make/stmicro.mk +++ b/tools/make/stmicro.mk @@ -27,11 +27,11 @@ clone-sources-stmicro: clone-sources-arm ext/stmicro/cubehal/ ext/stmicro/header .PHONY: update-sources-stmicro ## Update all STMicro related repositories to the latest version. 
update-sources-stmicro: update-sources-arm - @(cd ext/stmicro/cubehal && git pull) & - @(cd ext/stmicro/header && git pull) & - @(cd ext/stmicro/svd && git pull) & - @(cd ext/stmicro/owl-archive && git pull) & - @(cd ext/stmicro/svd-archive && git pull) & + @(cd ext/stmicro/cubehal && git fetch && git reset --hard origin/main) & + @(cd ext/stmicro/header && git fetch && git reset --hard origin/master) & + @(cd ext/stmicro/svd && git fetch && git reset --hard origin/main) & + @(cd ext/stmicro/owl-archive && git fetch && git reset --hard origin/main) & + @(cd ext/stmicro/svd-archive && git fetch && git reset --hard origin/master) & @wait @@ -71,9 +71,9 @@ clone-sources-stmicro-private: clone-sources-stmicro ext/stmicro/cubemx/ \ .PHONY: update-sources-stmicro-private update-sources-stmicro-private: update-sources-stmicro - @(cd ext/stmicro/cubemx && git pull) & - @(cd ext/stmicro/html-archive && git pull) & - @(cd ext/stmicro/pdf && git pull) & + @(cd ext/stmicro/cubemx && git fetch && git reset --hard origin/main) & + @(cd ext/stmicro/html-archive && git fetch && git reset --hard origin/main) & + @(cd ext/stmicro/pdf && git fetch && git reset --hard origin/main) & @wait @@ -87,18 +87,18 @@ ext/stmicro/html-archive/%: ext/stmicro/pdf/%.pdf log/stmicro/html/ ## archive. The log will be placed in log/stmicro/html/%.txt. convert-stmicro-html-%: ext/stmicro/html-archive/% -stmicro_pdf2html = $(sort $(1:ext/stmicro/pdf/%.pdf=ext/stmicro/html-archive/%)) +stmicro_pdf2html = $(sort $(foreach path,$1,$(path:ext/stmicro/pdf/%.pdf=ext/stmicro/html-archive/%))) .PHONY: convert-stmicro-html-rm ## Convert all STMicro Reference Manual PDFs into HTML. -convert-stmicro-html-rm: $(stmicro_pdf2html $(wildcard ext/stmicro/pdf/RM*.pdf)) +convert-stmicro-html-rm: $(call stmicro_pdf2html,$(wildcard ext/stmicro/pdf/RM*.pdf)) .PHONY: convert-stmicro-html-ds ## Convert all STMicro Datasheet PDFs into HTML. -convert-stmicro-html-ds: $(stmicro_pdf2html $(wildcard ext/stmicro/pdf/DS*.pdf)) +convert-stmicro-html-ds: $(call stmicro_pdf2html,$(wildcard ext/stmicro/pdf/DS*.pdf)) .PHONY: convert-stmicro-html ## Convert all STMicro PDFs into HTML. -convert-stmicro-html: $(stmicro_pdf2html $(wildcard ext/stmicro/pdf/*.pdf)) +convert-stmicro-html: $(call stmicro_pdf2html,$(wildcard ext/stmicro/pdf/*.pdf)) .PHONY: clean-stmicro-html-% ## Remove all STMicro HTML folders of a specific document number. diff --git a/tools/scripts/makefile_help.py b/tools/scripts/makefile_help.py index a70eeeb..d96e2e4 100644 --- a/tools/scripts/makefile_help.py +++ b/tools/scripts/makefile_help.py @@ -14,19 +14,20 @@ def parse_makefiles(makefiles: list[str]): for path in makefiles: content = Path(path).read_text() fcategory = "General" - if (cdoc := re.search(r"### *@([\w-]+) *(.*?) *\\(\d+)\n", content)): - fcategory = cdoc.group(1) - cdocs[fcategory] = (cdoc.group(2), int(cdoc.group(3) or 0)) - - rawdocs = re.findall(r"((?:##.+\n)+)(.+):", content, flags=re.MULTILINE) - for doc, rule in rawdocs: - doc = doc.replace("##", "") - if (category := re.search(r"@([\w-]+)", doc)): - doc = doc.replace(category.group(0), "") - category = category.group(1) - else: - category = fcategory - docs[category][rule] = [l.strip() for l in doc.splitlines()] + for groupcontent in re.split(r"### *@", content): + if (cdoc := re.search(r"^([\w-]+) *(.*?) 
*\\(\d+)\n", groupcontent)): + fcategory = cdoc.group(1) + cdocs[fcategory] = (cdoc.group(2), int(cdoc.group(3) or 0)) + + rawdocs = re.findall(r"((?:##.+\n)+)(.+):", groupcontent, flags=re.MULTILINE) + for doc, rule in rawdocs: + doc = doc.replace("##", "") + if (category := re.search(r"@([\w-]+)", doc)): + doc = doc.replace(category.group(0), "") + category = category.group(1) + else: + category = fcategory + docs[category][rule] = [l.strip() for l in doc.splitlines()] return dict(docs), cdocs diff --git a/tools/scripts/search_html.py b/tools/scripts/search_html.py index 8c9337b..a89b519 100644 --- a/tools/scripts/search_html.py +++ b/tools/scripts/search_html.py @@ -1,11 +1,5 @@ -# Copyright (c) 2022, Niklas Hauser -# -# This file is part of the modm-data project. -# -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at http://mozilla.org/MPL/2.0/. -# ----------------------------------------------------------------------------- +# Copyright 2022, Niklas Hauser +# SPDX-License-Identifier: MPL-2.0 import re import sys @@ -15,7 +9,6 @@ from pathlib import Path sys.path.append(".") -from modm_data.utils import ext_path from modm_data.html import Document def _format_html(xmlnode, treenode): @@ -93,7 +86,7 @@ def format_document(document): link = etree.Element("link") link.set("rel", "stylesheet") - link.set("href", "ext/stmicro/html/style.css") + link.set("href", "ext/stmicro/html-archive/style.css") head.append(link) body = etree.Element("body") @@ -113,8 +106,8 @@ def main(): parser.add_argument("--html", type=str) args = parser.parse_args() - documents = ext_path("stmicro/html").glob(args.document) - documents = [Document(d) for d in documents] + documents = (Path(__file__).parents[2] / "ext/stmicro/html-archive").absolute() + documents = [Document(d) for d in documents.glob(args.document)] rootnode = anytree.Node("root", document=args.document, chapter=args.chapter, table=args.table)