Coverage for phml\transform\sanitize\clean.py: 100%
74 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 18:56 -0600
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 18:56 -0600
1from re import match
2from typing import Optional
4from phml.nodes import AST, Element, Root
6from .schema import Schema
def sanatize(tree: AST | Root | Element, schema: Optional[Schema] = None):
    """Sanitize elements and attributes in the phml tree.

    Should be used when working with data from an unknown source. It should be
    run on an AST that has already been compiled to html so that no unknown
    values are left unchecked.

    By default the sanitization schema uses the github schema and follows the
    hast sanitize utility.

    * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
    * [hast sanitize](https://github.com/syntax-tree/hast-util-sanitize)

    The passes run in this order:

    1. Remove every element whose tag is listed in ``schema.strip``.
    2. Remove elements whose tag is not accepted by ``schema.tag_names``.
    3. Remove elements listed in ``schema.ancestors`` whose parent is the root
       or is not one of the allowed ancestor tags.
    4. Remove attributes that are not allowed by ``schema.attributes`` /
       ``schema.protocols`` for the element's tag.
    5. Add attributes from ``schema.required`` that are missing.

    Note:
        This utility will edit the tree in place.

    Args:
        tree (AST | Root | Element): The root of the tree that will be sanitized.
        schema (Optional[Schema], optional): User defined schema. When omitted
            or ``None`` a fresh default ``Schema()`` is used.
    """

    # Imported lazily, as in the original implementation.
    from phml import check, is_element, remove_nodes  # pylint: disable=import-outside-toplevel

    # Build the default per call: avoids sharing one Schema instance between
    # calls and makes an explicit ``schema=None`` valid, as the type promises.
    if schema is None:
        schema = Schema()

    src = tree.tree if isinstance(tree, AST) else tree

    for strip in schema.strip:
        remove_nodes(src, ["element", {"tag": strip}])

    def prune_children(node: Root | Element, should_drop):
        """Drop every element child for which ``should_drop`` is true, recursively."""
        kept = []
        for child in node.children:
            if check(child, "element"):
                if should_drop(child):
                    # Dropped subtrees are not inspected any further.
                    continue
                prune_children(child, should_drop)
            kept.append(child)
        # Slice assignment keeps the same list object on the node.
        node.children[:] = kept

    def has_invalid_tag(element: Element) -> bool:
        """Return True when the element's tag is not accepted by the schema."""
        return not is_element(element, schema.tag_names)

    def has_invalid_ancestor(element: Element) -> bool:
        """Return True when the element requires a parent tag it does not have."""
        if element.tag not in schema.ancestors:
            return False
        return (
            check(element.parent, "root")
            or element.parent.tag not in schema.ancestors[element.tag]
        )

    def check_protocols(value, protocols: list[str]) -> bool:
        """Return True when ``value`` starts with one of ``protocols`` followed by ``:``."""
        # NOTE(review): values without a listed protocol prefix (for example
        # relative URLs) are rejected, as before - confirm this is intended.
        if not isinstance(value, str):
            return False
        # Literal prefix comparison; the protocol is not treated as a regex.
        return any(value.startswith(f"{protocol}:") for protocol in protocols)

    def fails_protocol(name: str, value) -> bool:
        """Return True when ``name`` is protocol restricted and ``value`` violates it."""
        return name in schema.protocols and not check_protocols(value, schema.protocols[name])

    def build_valid_attributes(attributes: list) -> list[str]:
        """Extract the attribute names from a schema attribute rule list."""
        return [rule if isinstance(rule, str) else rule[0] for rule in attributes]

    def rule_rejects(rule, name: str, value) -> bool:
        """Return True when a single schema rule applies to ``name`` and rejects ``value``."""
        if isinstance(rule, (list, tuple)):
            if len(rule) > 1 and rule[0] == name:
                # ``[name, *allowed_values]``: the value must be ONE of the
                # allowed values, not equal to all of them.
                return value not in rule[1:] or fails_protocol(name, value)
            return False
        return rule == name and fails_protocol(name, value)

    def build_remove_attr_list(properties: dict, attributes: list, valid_attributes: list):
        """Build the list of attribute names to remove from ``properties``."""
        return [
            name
            for name, value in properties.items()
            if name not in valid_attributes
            or any(rule_rejects(rule, name, value) for rule in attributes)
        ]

    def recurse_check_attributes(node: Root | Element):
        """Strip disallowed attributes from every element below ``node``."""
        for child in node.children:
            if not check(child, "element"):
                continue

            # NOTE(review): elements whose tag has no entry in
            # schema.attributes keep all of their attributes - confirm intended.
            if child.tag in schema.attributes:
                rules = schema.attributes[child.tag]
                pop_attrs = build_remove_attr_list(
                    child.properties, rules, build_valid_attributes(rules)
                )
                for attribute in pop_attrs:
                    child.properties.pop(attribute, None)

            recurse_check_attributes(child)

    def recurse_check_required(node: Root | Element):
        """Add missing required attributes to every element below ``node``."""
        for child in node.children:
            if not check(child, "element"):
                continue

            if child.tag in schema.required:
                for attr, value in schema.required[child.tag].items():
                    if attr not in child.properties:
                        child[attr] = value

            # Always descend, so elements nested inside a tag that has
            # required attributes are processed as well.
            recurse_check_required(child)

    prune_children(src, has_invalid_tag)
    prune_children(src, has_invalid_ancestor)
    recurse_check_attributes(src)
    recurse_check_required(src)