Coverage for phml\core\parser\hypertext_markup_parser.py: 100%
87 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 11:07 -0600
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 11:07 -0600
1"""Pythonic Hypertext Markup Language (phml) parser."""
3from html.parser import HTMLParser
4from typing import Optional
6from phml.nodes import Comment, DocType, Element, Point, Position, Properties, Root, Text
# Tags that never carry a separate closing tag. The parser marks elements with
# these names as ``startend`` and does not push them onto the open-tag stack.
self_closing_tags = [
    "area", "base", "br", "col", "embed", "hr",
    "img", "input", "link", "meta", "param", "source",
    "track", "wbr", "command", "keygen", "menuitem",
]
def build_point(pos: tuple[int, int], offset: Optional[int] = None) -> Point:
    """Create a ``Point`` from a position tuple.

    Args:
        pos: Tuple whose first two items are passed to ``Point`` in order.
        offset: Optional third argument forwarded to ``Point`` unchanged.

    Returns:
        The newly constructed ``Point``.
    """
    first, second = pos[0], pos[1]
    return Point(first, second, offset)
def build_position(
    start: tuple[int, int, Optional[int]],
    end: tuple[int, int, Optional[int]],
    indent: Optional[int] = None,
) -> Position:
    """Create a ``Position`` from a start tuple and an end tuple.

    Args:
        start: Tuple describing where the node begins; only its first two
            items are used (via ``build_point``).
        end: Tuple describing where the node ends; only its first two items
            are used (via ``build_point``).
        indent: Optional value forwarded to ``Position`` unchanged.

    Returns:
        The newly constructed ``Position``.
    """
    start_point = build_point(start)
    end_point = build_point(end)
    return Position(start_point, end_point, indent)
def calc_end_of_tag(tag_text: str, cur_pos: tuple[int, int]) -> tuple[int, int]:
    """Compute where a start tag ends, given its text and starting position.

    Args:
        tag_text: Raw text of the opening tag; may span several lines.
        cur_pos: ``(line, column)`` at which the tag text begins.

    Returns:
        ``(line, column)`` of the end of the tag. For a tag on a single line
        the column is the starting column plus the tag length; for a tag that
        spans lines it is the length of the tag's final line.
    """
    newline_count = tag_text.count("\n")
    tail_length = len(tag_text.rsplit("\n", 1)[-1])

    if newline_count == 0:
        # Tag stays on the starting line: advance the column by its length.
        return cur_pos[0], tail_length + cur_pos[1]

    # Tag wraps: the column restarts on the tag's last line.
    return cur_pos[0] + newline_count, tail_length
def strip_and_count(data: str, cur_pos: tuple[int, int]) -> tuple[str, int, int]:
    """Strip leading and trailing blank lines from ``data`` and locate its end.

    Args:
        data: Text that may span several lines.
        cur_pos: ``(line, column)`` at which ``data`` starts.

    Returns:
        A ``(text, end_line, end_col)`` tuple:

        * Blank data (only whitespace, on one line or many) yields ``""`` with
          ``end_line == cur_pos[0]`` and
          ``end_col == len(data) + cur_pos[1]``.
        * Non-blank single-line data is returned unchanged with the same
          ``end_line`` / ``end_col`` as above.
        * Multi-line data is returned with its leading and trailing blank
          lines removed; ``end_line`` is ``cur_pos[0]`` plus the number of
          kept lines minus one, and ``end_col`` is the length of the last
          kept line.
    """
    # Result position used whenever no multi-line content is kept.
    fallback_col = len(data) + cur_pos[1]
    raw_lines = data.split("\n")

    if len(raw_lines) == 1:
        # Always return a str: blank single-line data becomes "" rather than
        # leaking the intermediate list to the caller.
        text = raw_lines[0] if raw_lines[0].strip() else ""
        return text, cur_pos[0], fallback_col

    content_indexes = [idx for idx, line in enumerate(raw_lines) if line.strip()]
    if not content_indexes:
        return "", cur_pos[0], fallback_col

    # Slice from the first to the last non-blank line inclusive. This also
    # trims trailing blanks when the only content is the first line.
    kept = raw_lines[content_indexes[0] : content_indexes[-1] + 1]
    return "\n".join(kept), cur_pos[0] + len(kept) - 1, len(kept[-1])
class HypertextMarkupParser(HTMLParser):
    """Custom html parser inherited from the python
    built-in html.parser.

    Each ``handle_*`` callback appends a node to ``self.cur.children``;
    opening a non-self-closing tag descends into the new element and the
    matching closing tag climbs back to its parent.
    """

    cur: Root | Element
    """The current parent element in the recursion."""

    cur_tags: list
    """Stack of all open tags. Used for balancing tags."""

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

        self.cur = Root()
        self.cur_tags = []

    @staticmethod
    def _build_properties(attrs):
        """Convert ``(name, value)`` attribute pairs into a properties dict.

        A value of ``None`` (attribute given without a value) becomes
        ``True``, the literal string ``"no"`` becomes ``False``, and any
        other value is stored unchanged.
        """
        properties: Properties = {}
        for name, value in attrs:
            if value is None:
                properties[name] = True
            elif value == "no":
                properties[name] = False
            else:
                properties[name] = value
        return properties

    def handle_decl(self, decl: str) -> None:
        """Append a ``DocType`` node for a ``doctype`` declaration.

        Declarations whose first space-separated token is not ``doctype``
        (case-insensitive) are ignored, as is a doctype with no second token.

        Raises:
            Exception: If a doctype is encountered while the current node is
                not the root.
        """
        tokens = decl.split(" ")
        if tokens[0].lower() != "doctype":
            return

        if self.cur.type != "root":
            raise Exception("<!doctype> must be in the root!")

        if len(tokens) > 1:
            self.cur.children.append(
                DocType(
                    lang=tokens[1],
                    parent=self.cur,
                    position=build_position(self.getpos(), self.getpos()),
                )
            )

    def handle_starttag(self, tag, attrs):
        """Append a new ``Element`` and, unless self-closing, descend into it."""
        element = Element(
            tag=tag,
            properties=self._build_properties(attrs),
            parent=self.cur,
        )
        self.cur.children.append(element)

        if tag in self_closing_tags:
            element.startend = True
            element.position = build_position(
                self.getpos(), calc_end_of_tag(self.get_starttag_text(), self.getpos())
            )
        else:
            self.cur = element
            self.cur_tags.append(element)
            # The end point is a placeholder; handle_endtag overwrites it.
            element.position = build_position(self.getpos(), (0, 0))

    def handle_startendtag(self, tag, attrs):
        """Append an ``Element`` flagged ``startend`` for a ``<tag/>`` form."""
        self.cur.children.append(
            Element(
                tag=tag,
                properties=self._build_properties(attrs),
                parent=self.cur,
                startend=True,
                position=build_position(
                    self.getpos(), calc_end_of_tag(self.get_starttag_text(), self.getpos())
                ),
            )
        )

    def handle_endtag(self, tag):
        """Close the current element and climb back to its parent.

        Raises:
            Exception: If there is no open tag to close, or if ``tag`` does
                not match the most recently opened tag.
        """
        line, col = self.getpos()

        # A closing tag with nothing open would otherwise fail with a bare
        # IndexError on the empty stack; report it like a mismatch instead.
        if not self.cur_tags:
            raise Exception(f"Closing tag </{tag}> has no matching open tag at [{line}:{col}]")

        if tag != self.cur_tags[-1].tag:
            raise Exception(f"Mismatched tags <{self.cur.tag}> and </{tag}> at [{line}:{col}]")

        self.cur.position.end = build_point(self.getpos())
        self.cur = self.cur.parent
        self.cur_tags.pop()

    def handle_data(self, data):
        """Append a ``Text`` node unless the stripped data is empty."""
        data, eline, ecol = strip_and_count(data, self.getpos())

        if data not in [[], "", None]:
            self.cur.children.append(
                Text(
                    data,
                    self.cur,
                    position=build_position(self.getpos(), (eline, ecol)),
                )
            )

    def handle_comment(self, data: str) -> None:
        """Append a ``Comment`` node holding the stripped comment text."""
        data, eline, ecol = strip_and_count(data, self.getpos())

        # NOTE(review): 7 presumably covers both "<!--" and "-->" when the
        # comment ends on its starting line, and 3 only "-->" otherwise.
        if eline == self.getpos()[0]:
            ecol += 7
        else:
            ecol += 3

        self.cur.children.append(
            Comment(
                value=data,
                parent=self.cur,
                position=build_position(
                    self.getpos(),
                    (eline, ecol),
                ),
            )
        )