Move from bs4 to etree

BeautifulSoup when using lxml in XML mode would mis-parse XML with very long attributes. Specifically, a <polygon> with about 18MB in its points attr would make lxml not return anything past that point in the file. bs4 uses lxml, which uses libxml2. libxml2 has a config option for parsing "huge" files that increases buffer sizes and avoids this error, and this option is exposed in lxml, but AFAICT you can't tell bs4 to set it, and bs4 just silently swallows the error from lxml. Fixes one half of #46
author: jaseg <git@jaseg.de> 2024-07-05 11:13:25 +0200
committer: jaseg <git@jaseg.de> 2024-07-05 12:37:42 +0200
commit: bd2b3733ee87bd918c9ea7ca8f871951f7f86bca (patch)
tree: c348aa5d757abceafdd6ee1bd86f0ab5c4a705d5 /gerbolyze
parent: 534c2e4ea3229c6ffef640ea8b55b0c18e02fd7b (diff)
download: gerbolyze-bd2b3733ee87bd918c9ea7ca8f871951f7f86bca.tar.gz
gerbolyze-bd2b3733ee87bd918c9ea7ca8f871951f7f86bca.tar.bz2
gerbolyze-bd2b3733ee87bd918c9ea7ca8f871951f7f86bca.zip
1 files changed, 14 insertions, 7 deletions
diff --git a/gerbolyze/__init__.py b/gerbolyze/__init__.py
index 48ee4f7..40238ed 100755
--- a/gerbolyze/__init__.py
+++ b/gerbolyze/__init__.py
@@ -14,8 +14,8 @@ import warnings
 import shutil
 from zipfile import ZipFile, is_zipfile
 from pathlib import Path
+from xml.etree import ElementTree
 
-from bs4 import BeautifulSoup
 import numpy as np
 import click
 
@@ -23,6 +23,11 @@ import gerbonara as gn
 
 __version__ = '3.1.8'
 
+ET_NS= {'svg': 'http://www.w3.org/2000/svg',
+          'inkscape': 'http://www.inkscape.org/namespaces/inkscape'}
+SVG_NS = f'{{{ET_NS["svg"]}}}'
+INKSCAPE_NS = f'{{{ET_NS["inkscape"]}}}'
+
 @click.group()
 def cli():
     pass
@@ -71,16 +76,17 @@ def paste(input_gerbers, input_svg, output_gerbers, is_zip,
     with tempfile.NamedTemporaryFile(suffix='.svg') as processed_svg:
         run_cargo_command('usvg', *shlex.split(os.environ.get('USVG_OPTIONS', '')), input_svg, processed_svg.name)
 
-        with open(processed_svg.name) as f:
-            soup = BeautifulSoup(f.read(), features='xml')
+        et = ElementTree.parse(processed_svg)
     
         for (side, use), layer in [
                 *stack.graphic_layers.items(),
                 (('drill', 'plated'), stack.drill_pth),
                 (('drill', 'nonplated'), stack.drill_npth)]:
             logging.info(f'Layer {side} {use}')
-            if (soup_layer := soup.find('g', id=f'g-{side}-{use}')):
-                if not soup_layer.contents:
+
+            et_layer = et.find(f".//{SVG_NS}g[@id='g-{side}-{use}']")
+            if et_layer is not None:
+                if not len(et_layer):
                     logging.info(f'    Corresponding overlay layer is empty. Skipping.')
             else:
                 logging.info(f'    Corresponding overlay layer not found. Skipping.')
@@ -271,8 +277,9 @@ def convert(input_svg, output_gerbers, is_zip, dilate, curve_tolerance, subtract
     with tempfile.NamedTemporaryFile(suffix='.svg') as processed_svg:
         run_cargo_command('usvg', *shlex.split(os.environ.get('USVG_OPTIONS', '')), input_svg, processed_svg.name)
 
-        soup = BeautifulSoup(input_svg.read_text(), features='xml')
-        layers = {e.get('id'): e.get('inkscape:label') for e in soup.find_all('g', recursive=True)}
+        et = ElementTree.fromstring(input_svg.read_text())
+        layers = {node.get(f'id'): node.get(f'{INKSCAPE_NS}label')
+                  for node in et.findall(f'{SVG_NS}g')}
 
         stack = gn.LayerStack({}, None, None, [], board_name=input_svg.stem, original_path=input_svg)
author	jaseg <git@jaseg.de>	2024-07-05 11:13:25 +0200
committer	jaseg <git@jaseg.de>	2024-07-05 12:37:42 +0200
commit	bd2b3733ee87bd918c9ea7ca8f871951f7f86bca (patch)
tree	c348aa5d757abceafdd6ee1bd86f0ab5c4a705d5 /gerbolyze
parent	534c2e4ea3229c6ffef640ea8b55b0c18e02fd7b (diff)
download	gerbolyze-bd2b3733ee87bd918c9ea7ca8f871951f7f86bca.tar.gz gerbolyze-bd2b3733ee87bd918c9ea7ca8f871951f7f86bca.tar.bz2 gerbolyze-bd2b3733ee87bd918c9ea7ca8f871951f7f86bca.zip