Coverage for phml\core\parser\hypertext_markup_parser.py: 100%

87 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 11:07 -0600

1"""Pythonic Hypertext Markup Language (phml) parser.""" 

2 

3from html.parser import HTMLParser 

4from typing import Optional 

5 

6from phml.nodes import Comment, DocType, Element, Point, Position, Properties, Root, Text 

7 

# Tag names the parser treats as void ("self-closing") elements: a start tag
# with one of these names is recorded with ``startend = True`` and is never
# pushed onto the open-tag stack, so no matching end tag is expected.
self_closing_tags = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
    "command",
    "keygen",
    "menuitem",
]

27 

28 

def build_point(pos: tuple[int, int], offset: Optional[int] = None) -> Point:
    """Build a phml.nodes.Point from a position tuple.

    Args:
        pos: Position tuple; only its first two items are read and they are
            passed to ``Point`` in that order (the parser supplies the
            ``(line, column)`` pair returned by ``HTMLParser.getpos()``).
        offset: Optional value passed through unchanged as the third
            ``Point`` argument.

    Returns:
        The constructed ``Point``.
    """
    line, column = pos[0], pos[1]
    return Point(line, column, offset)

32 

33 

def build_position(
    start: tuple[int, int, Optional[int]],
    end: tuple[int, int, Optional[int]],
    indent: Optional[int] = None,
) -> Position:
    """Build a phml.nodes.Position from a start tuple and an end tuple.

    Args:
        start: Start of the span. The first two items are handed to
            ``build_point``; a third item, when present, is forwarded as that
            point's offset instead of being silently dropped.
        end: End of the span, interpreted the same way as ``start``.
        indent: Optional value passed through unchanged as the third
            ``Position`` argument.

    Returns:
        The constructed ``Position``.
    """

    def _offset_of(pos):
        # Two-item tuples (what the parser passes) carry no offset.
        return pos[2] if len(pos) > 2 else None

    return Position(
        build_point(start, _offset_of(start)),
        build_point(end, _offset_of(end)),
        indent,
    )

41 

42 

def calc_end_of_tag(tag_text: str, cur_pos: tuple[int, int]) -> tuple[int, int]:
    """Calculate where a start tag ends.

    Args:
        tag_text: Raw text of the start tag, possibly spanning several lines.
        cur_pos: ``(line, column)`` at which the tag begins.

    Returns:
        ``(line, column)`` of the end of the tag. For a tag on a single line
        the column is relative to ``cur_pos``; for a tag that wraps, the
        column is the length of the tag's final line.
    """
    # No tag text available: the tag cannot extend past where it started.
    if not tag_text:
        return cur_pos[0], cur_pos[1]

    extra_lines = tag_text.count("\n")
    last_line_len = len(tag_text.rpartition("\n")[2])

    if extra_lines == 0:
        # Still on the starting line, so the start column still applies.
        return cur_pos[0], cur_pos[1] + last_line_len

    return cur_pos[0] + extra_lines, last_line_len

52 

53 

def strip_and_count(data: str, cur_pos: tuple[int, int]) -> tuple[str, int, int]:
    """Strip leading and trailing blank lines and locate where the data ends.

    Lines consisting only of whitespace are removed from both ends of
    ``data``; blank lines between non-blank lines are kept, as is the
    whitespace inside the remaining lines.

    Args:
        data: A possibly multiline string.
        cur_pos: ``(line, column)`` at which ``data`` begins.

    Returns:
        ``(text, end_line, end_col)``. ``text`` is always a string and is
        ``""`` when ``data`` holds nothing but whitespace; in that case the
        end position is ``cur_pos`` advanced by ``len(data)`` columns.
        Otherwise the end position is the end of the last non-blank line,
        counted from ``cur_pos`` and including any leading blank lines that
        were stripped.
    """
    raw_lines = data.split("\n")
    kept = [idx for idx, line in enumerate(raw_lines) if line.strip()]

    if not kept:
        return "", cur_pos[0], cur_pos[1] + len(data)

    first, last = kept[0], kept[-1]
    text = "\n".join(raw_lines[first : last + 1])

    end_col = len(raw_lines[last])
    if last == 0:
        # The text ends on the line it started on, so the start column
        # still contributes to the end column.
        end_col += cur_pos[1]

    return text, cur_pos[0] + last, end_col

94 

95 

class HypertextMarkupParser(HTMLParser):
    """Custom html parser inherited from the python
    built-in html.parser.

    As markup is fed in, nodes are appended to the children of ``cur``,
    starting from a fresh ``Root``. Opening a non-void element makes it the
    new ``cur``; its end tag restores the parent. End tags that do not match
    the innermost open element raise an ``Exception``.
    """

    cur: Root | Element
    """The current parent element in the recursion."""

    cur_tags: list
    """Stack of all open tags. Used for balancing tags."""

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

        self.cur = Root()
        self.cur_tags = []

    @staticmethod
    def _build_properties(attrs):
        """Convert html.parser ``(name, value)`` attribute pairs to properties.

        An attribute without a value becomes ``True``, the literal value
        ``"no"`` becomes ``False``, and any other value is kept as given.
        """
        properties: Properties = {}
        for name, value in attrs:
            if value is None:
                properties[name] = True
            else:
                properties[name] = False if value == "no" else value
        return properties

    def handle_decl(self, decl: str) -> None:
        """Append a ``DocType`` node for a ``<!doctype ...>`` declaration.

        Other declarations are ignored, and so is a doctype that names no
        language.

        Raises:
            Exception: If the doctype appears anywhere but directly in the root.
        """
        # Split on any whitespace so repeated spaces, tabs or newlines inside
        # the declaration do not hide the keyword or yield an empty language.
        tokens = decl.split()
        if not tokens or tokens[0].lower() != "doctype":
            return

        if self.cur.type != "root":
            raise Exception("<!doctype> must be in the root!")

        if len(tokens) > 1:
            pos = self.getpos()
            self.cur.children.append(
                DocType(
                    lang=tokens[1],
                    parent=self.cur,
                    position=build_position(pos, pos),
                )
            )

    def handle_starttag(self, tag, attrs):
        """Append a new element; descend into it unless the tag is void."""
        start = self.getpos()
        element = Element(
            tag=tag,
            properties=self._build_properties(attrs),
            parent=self.cur,
        )
        self.cur.children.append(element)

        if tag in self_closing_tags:
            # Void element: complete as soon as its start tag ends.
            element.startend = True
            element.position = build_position(
                start, calc_end_of_tag(self.get_starttag_text(), start)
            )
        else:
            self.cur = element
            self.cur_tags.append(element)
            # The end point is a placeholder until handle_endtag fills it in.
            element.position = build_position(start, (0, 0))

    def handle_startendtag(self, tag, attrs):
        """Append an element written in the explicit ``<tag />`` form."""
        start = self.getpos()
        self.cur.children.append(
            Element(
                tag=tag,
                properties=self._build_properties(attrs),
                parent=self.cur,
                startend=True,
                position=build_position(
                    start, calc_end_of_tag(self.get_starttag_text(), start)
                ),
            )
        )

    def handle_endtag(self, tag):
        """Close the innermost open element.

        Raises:
            Exception: If no element is open, or if ``tag`` is not the tag of
                the innermost open element.
        """
        line, col = self.getpos()

        if not self.cur_tags:
            # A stray end tag used to surface as an IndexError from the
            # empty stack; report it as the tag mismatch it is.
            raise Exception(
                f"Mismatched tags: </{tag}> has no matching open tag at [{line}:{col}]"
            )

        if tag != self.cur_tags[-1].tag:
            raise Exception(
                f"Mismatched tags <{self.cur.tag}> and </{tag}> at [{line}:{col}]"
            )

        self.cur.position.end = build_point((line, col))
        self.cur = self.cur.parent
        self.cur_tags.pop()

    def handle_data(self, data):
        """Append a ``Text`` node unless the data is only whitespace."""
        start = self.getpos()
        data, eline, ecol = strip_and_count(data, start)

        # Whitespace-only data yields no usable string; skip it rather than
        # storing an empty or non-string text value.
        if not isinstance(data, str) or data == "":
            return

        self.cur.children.append(
            Text(
                data,
                self.cur,
                position=build_position(start, (eline, ecol)),
            )
        )

    def handle_comment(self, data: str) -> None:
        """Append a ``Comment`` node holding the stripped comment text."""
        start = self.getpos()
        data, eline, ecol = strip_and_count(data, start)

        if not isinstance(data, str):
            # Whitespace-only comment: store an empty value, not a non-string.
            data = ""

        # Widen the end column by the comment delimiters: "<!--" plus "-->"
        # (7 characters) when the comment ends on its starting line,
        # otherwise only the closing "-->" (3 characters).
        ecol += 7 if eline == start[0] else 3

        self.cur.children.append(
            Comment(
                value=data,
                parent=self.cur,
                position=build_position(start, (eline, ecol)),
            )
        )

213 )