Coverage for phml\transform\sanitize\clean.py: 100%

74 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 18:56 -0600

1from re import match 

2from typing import Optional 

3 

4from phml.nodes import AST, Element, Root 

5 

6from .schema import Schema 

7 

8 

def sanatize(tree: AST | Root | Element, schema: Optional[Schema] = None):
    """Sanitize elements and attributes in the phml tree.

    Should be used when working with data from an unknown source. It should be
    used with an AST that has already been compiled to html so that no unknown
    values are left unchecked.

    By default the sanitization schema uses the github schema and follows the
    hast sanitize utility.

    * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
    * [hast sanitize](https://github.com/syntax-tree/hast-util-sanitize)

    The passes run in this order: strip listed tags, drop elements whose tag is
    not allowed, drop elements with an invalid parent, drop disallowed
    attributes, then add missing required attributes.

    Note:
        This utility will edit the tree in place.

    Args:
        tree (AST | Root | Element): The root of the tree that will be sanitized.
        schema (Optional[Schema], optional): User defined schema. Defaults to
            github schema (a new `Schema()` is created when omitted or `None`).
    """

    from phml import check, is_element, remove_nodes  # pylint: disable=import-outside-toplevel

    # Build the default per call: a `Schema()` in the signature would be
    # created once at definition time and shared by every caller.
    if schema is None:
        schema = Schema()

    src = tree.tree if isinstance(tree, AST) else tree

    for strip in schema.strip:
        remove_nodes(src, ["element", {"tag": strip}])

    def check_protocols(value: str, protocols: list[str]) -> bool:
        """Return True when `value` starts with one of `protocols` followed by a colon."""
        return any(match(f"{protocol}:.*", value) is not None for protocol in protocols)

    def recurse_check_tag(node: Root | Element):
        """Remove every element whose tag is not listed in `schema.tag_names`."""
        pop_els = []
        for child in node.children:
            if not check(child, "element"):
                continue
            if is_element(child, schema.tag_names):
                recurse_check_tag(child)
            else:
                # Removal is deferred so the list is not mutated while iterating.
                pop_els.append(child)

        for element in pop_els:
            node.children.remove(element)

    def has_invalid_parent(child: Element) -> bool:
        """Return True when the schema restricts `child`'s parent and it does not comply."""
        if child.tag not in schema.ancestors:
            return False
        # NOTE(review): only the direct parent is inspected, not the whole
        # ancestor chain -- confirm this matches the intended schema semantics.
        return (
            check(child.parent, "root")
            or child.parent.tag not in schema.ancestors[child.tag]
        )

    def recurse_check_ancestor(node: Root | Element):
        """Remove elements that are not nested under a parent allowed by `schema.ancestors`."""
        pop_els = []
        for child in node.children:
            if not check(child, "element"):
                continue
            if has_invalid_parent(child):
                pop_els.append(child)
            else:
                recurse_check_ancestor(child)

        for element in pop_els:
            node.children.remove(element)

    def build_valid_attributes(attributes: list) -> list[str]:
        """Extract attribute names from schema rules.

        A rule is either a bare attribute name or a sequence whose first item
        is the attribute name.
        """
        return [
            attribute if isinstance(attribute, str) else attribute[0]
            for attribute in attributes
        ]

    def build_remove_attr_list(properties: dict, attributes: dict, valid_attributes: list):
        """Build the list of attributes to remove from a dict of attributes.

        An attribute is removed when it is not named in `valid_attributes`,
        when a rule restricts its value and the value is not one of the allowed
        ones, or when it has an entry in `schema.protocols` and its value does
        not start with one of those protocols.
        """
        result = []
        for attribute in properties:
            if attribute not in valid_attributes:
                result.append(attribute)
                continue

            value = properties[attribute]
            for attr in attributes:
                if isinstance(attr, list) and attr[0] == attribute and len(attr) > 1:
                    # attr[1:] lists the allowed values; matching any one of
                    # them is enough (requiring all of them could never pass
                    # for a rule with more than one allowed value).
                    value_allowed = any(val == value for val in attr[1:])
                    bad_protocol = attribute in schema.protocols and not check_protocols(
                        value, schema.protocols[attribute]
                    )
                    if not value_allowed or bad_protocol:
                        result.append(attribute)
                        break
                elif (
                    attr == attribute
                    and attr in schema.protocols
                    and not check_protocols(value, schema.protocols[attribute])
                ):
                    result.append(attribute)
                    break

        return result

    def recurse_check_attributes(node: Root | Element):
        """Drop disallowed attributes from every element that has rules in `schema.attributes`."""
        for child in node.children:
            if not check(child, "element"):
                continue

            if child.tag in schema.attributes:
                rules = schema.attributes[child.tag]
                pop_attrs = build_remove_attr_list(
                    child.properties, rules, build_valid_attributes(rules)
                )
                for attribute in pop_attrs:
                    child.properties.pop(attribute, None)

            recurse_check_attributes(child)

    def recurse_check_required(node: Root | Element):
        """Add the attributes from `schema.required` that an element is missing."""
        for child in node.children:
            if not check(child, "element"):
                continue

            if child.tag in schema.required:
                for attr, value in schema.required[child.tag].items():
                    if attr not in child.properties:
                        child[attr] = value

            # Always descend: elements that have required attributes can still
            # contain descendants that need their own required attributes.
            recurse_check_required(child)

    recurse_check_tag(src)
    recurse_check_ancestor(src)
    recurse_check_attributes(src)
    recurse_check_required(src)

137 recurse_check_required(src)