Coverage for phml\locate\select.py: 99%

212 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 16:33 -0600

1"""utils.select 

2 

3A collection of utilities around querying for specific 

4types of data. 

5""" 

6 

7import re 

8from typing import Callable 

9 

10from phml.nodes import AST, Element, Root 

11from phml.travel.travel import visit_children, walk 

12 

13__all__ = ["query", "query_all", "matches", "parse_specifiers"] 

14 

15 

def query(tree: AST | Root | Element, specifier: str) -> Element | None:
    """Return the first element matching `specifier`, like JavaScript's `querySelector`.

    `#` indicates an id and `.` indicates a class. Used without a tag name they
    match any tag. Any tag can be used by itself or together with `#` and/or `.`.
    You may use any number of class specifiers, but only one id specifier per
    tag name. Complex specifiers are allowed: space separated specifiers express
    nesting, and `>`, `+` and `~` express child / sibling relationships.

    Examples:
    * `.some-example` matches the first element with the class `some-example`
    * `#some-example` matches the first element with the id `some-example`
    * `li` matches the first `li` element
    * `li.red` matches the first `li` with the class `red`
    * `li#red` matches the first `li` with the id `red`
    * `input[type="checkbox"]` matches the first `input` with the attribute `type="checkbox"`
    * `div.form-control input[type="checkbox"]` matches the first `input` with the
    attribute `type="checkbox"` that is nested inside a `div` with the class `form-control`.

    Args:
        tree (AST | Root | Element): Where to search. For an `AST` its `tree`
            attribute is searched.
        specifier (str): The selector, parsed with `parse_specifiers`.

    Returns:
        Element | None: The first element matching the specifier or None if no element was
        found.
    """

    def position_of(siblings: list, target: Element) -> int:
        """Index of `target` in `siblings`, preferring an identity match."""
        for position, sibling in enumerate(siblings):
            if sibling is target:
                return position
        # NOTE(review): fall back to the equality-based lookup (raises ValueError
        # when absent). Identity is tried first because, if nodes compare by
        # value, `list.index` would return an earlier look-alike sibling.
        return siblings.index(target)

    def all_nodes(current: Element, rules: list, include_self: bool = True):
        """First match among the nodes yielded by `walk(current)`.

        With `include_self=False` the `current` object itself is skipped
        (presumably `walk` yields it along with its descendants).
        """
        for node in walk(current):
            if node.type == "element" and (include_self or node is not current):
                result = branch(node, rules)
                if result is not None:
                    return result
        return None

    def all_children(current: Element, rules: list):
        """First match among the element nodes yielded by `visit_children(current)`."""
        for node in visit_children(current):
            if node.type == "element":
                result = branch(node, rules)
                if result is not None:
                    return result
        return None

    def first_sibling(node: Element, rules: list):
        """Match only the node immediately following `node` in its parent."""
        if node.parent is None:
            return None

        siblings = node.parent.children
        following = position_of(siblings, node) + 1
        # Only the very next child is considered, and only if it is an element.
        if following < len(siblings) and siblings[following].type == "element":
            return branch(siblings[following], rules)
        return None

    def all_siblings(current: Element, rules: list):
        """First match among the element siblings that come after `current`."""
        if current.parent is None:
            return None

        siblings = current.parent.children
        for position in range(position_of(siblings, current) + 1, len(siblings)):
            if siblings[position].type == "element":
                result = branch(siblings[position], rules)
                if result is not None:
                    return result
        return None

    def process_dict(rules: list, node: Element):
        """Check `node` against the element rule `rules[0]`, then continue with the rest."""
        if not is_equal(rules[0], node):
            return None

        if len(rules) == 1:
            return node

        # An element rule directly followed by another element rule (or by `*`)
        # is a descendant relationship, so the matched node itself must be
        # excluded; otherwise `div div` would match a lone `div`.
        if isinstance(rules[1], dict):
            return all_nodes(node, rules[1:], False)
        if rules[1] == "*":
            return all_nodes(node, rules[2:], False)

        return branch(node, rules[1:])

    def branch(node: Element, rules: list):  # pylint: disable=too-many-return-statements
        """Based on the current rule, recursively check the nodes.
        If on the last rule then return the current valid node.
        """

        if len(rules) == 0:
            return node

        if isinstance(rules[0], dict):
            return process_dict(rules, node)

        if rules[0] == "*":
            return all_nodes(node, rules[1:])

        if rules[0] == ">":
            return all_children(node, rules[1:])

        if rules[0] == "+":
            return first_sibling(node, rules[1:])

        if rules[0] == "~":
            return all_siblings(node, rules[1:])

        return None

    if isinstance(tree, AST):
        tree = tree.tree

    rules = parse_specifiers(specifier)

    return all_nodes(tree, rules)

133 

134 

def query_all(tree: AST | Root | Element, specifier: str) -> list[Element]:
    """Return every element matching `specifier`, like JavaScript's `querySelectorAll`.

    `#` indicates an id and `.` indicates a class. Used without a tag name they
    match any tag. Any tag can be used by itself or together with `#` and/or `.`.
    You may use any number of class specifiers, but only one id specifier per
    tag name. Complex specifiers are allowed: space separated specifiers express
    nesting, and `>`, `+` and `~` express child / sibling relationships.

    Examples:
    * `.some-example` matches every element with the class `some-example`
    * `#some-example` matches every element with the id `some-example`
    * `li` matches every `li` element
    * `li.red` matches every `li` with the class `red`
    * `li#red` matches every `li` with the id `red`
    * `input[type="checkbox"]` matches every `input` with the attribute `type="checkbox"`
    * `div.form-control input[type="checkbox"]` matches every `input` with the
    attribute `type="checkbox"` that is nested inside a `div` with the class `form-control`.

    Args:
        tree (AST | Root | Element): Where to search. For an `AST` its `tree`
            attribute is searched.
        specifier (str): The selector, parsed with `parse_specifiers`.

    Returns:
        list[Element]: All elements matching the specifier, each listed once in
        the order first found, or an empty list if no elements were found.
    """

    def position_of(siblings: list, target: Element) -> int:
        """Index of `target` in `siblings`, preferring an identity match."""
        for position, sibling in enumerate(siblings):
            if sibling is target:
                return position
        # NOTE(review): fall back to the equality-based lookup (raises ValueError
        # when absent). Identity is tried first because, if nodes compare by
        # value, `list.index` would return an earlier look-alike sibling.
        return siblings.index(target)

    def all_nodes(current: Element, rules: list, include_self: bool = True):
        """Collect matches from every node yielded by `walk(current)`.

        With `include_self=False` the `current` object itself is skipped
        (presumably `walk` yields it along with its descendants).
        """
        results = []
        for node in walk(current):
            if node.type == "element" and (include_self or node is not current):
                results.extend(branch(node, rules))
        return results

    def all_children(current: Element, rules: list):
        """Collect matches from the element nodes yielded by `visit_children(current)`."""
        results = []
        for node in visit_children(current):
            if node.type == "element":
                results.extend(branch(node, rules))
        return results

    def first_sibling(node: Element, rules: list):
        """Collect matches from the node immediately following `node` in its parent."""
        if node.parent is None:
            return []

        siblings = node.parent.children
        following = position_of(siblings, node) + 1
        # Only the very next child is considered, and only if it is an element.
        if following < len(siblings) and siblings[following].type == "element":
            return branch(siblings[following], rules)
        return []

    def all_siblings(current: Element, rules: list):
        """Collect matches from every element sibling that comes after `current`."""
        if current.parent is None:
            return []

        results = []
        siblings = current.parent.children
        for position in range(position_of(siblings, current) + 1, len(siblings)):
            if siblings[position].type == "element":
                results.extend(branch(siblings[position], rules))
        return results

    def process_dict(rules: list, node: Element):
        """Check `node` against the element rule `rules[0]`, then continue with the rest."""
        if not is_equal(rules[0], node):
            return []

        if len(rules) == 1:
            return [node]

        # An element rule directly followed by another element rule (or by `*`)
        # is a descendant relationship, so the matched node itself must be
        # excluded; otherwise `div div` would match a lone `div`.
        if isinstance(rules[1], dict):
            return all_nodes(node, rules[1:], False)
        if rules[1] == "*":
            return all_nodes(node, rules[2:], False)

        return branch(node, rules[1:])

    def branch(node: Element, rules: list):  # pylint: disable=too-many-return-statements
        """Based on the current rule, recursively check the nodes.
        If on the last rule then return the current valid node in a list.
        """

        if len(rules) == 0:
            return [node]

        if isinstance(rules[0], dict):
            return process_dict(rules, node)

        if rules[0] == "*":
            return all_nodes(node, rules[1:])

        if rules[0] == ">":
            return all_children(node, rules[1:])

        if rules[0] == "+":
            return first_sibling(node, rules[1:])

        if rules[0] == "~":
            return all_siblings(node, rules[1:])

        # Every caller extends a list with this result, so an unknown rule must
        # yield an empty list rather than None.
        return []

    if isinstance(tree, AST):
        tree = tree.tree

    rules = parse_specifiers(specifier)

    # The same node can be reached through several ancestors; keep only its
    # first occurrence. Identity is used so that distinct nodes are never
    # merged (NOTE(review): relevant if nodes compare by value) and the pass
    # stays linear.
    seen = set()
    unique = []
    for element in all_nodes(tree, rules):
        if id(element) not in seen:
            seen.add(id(element))
            unique.append(element)
    return unique

246 

247 

def matches(node: Element, specifier: str) -> bool:
    """Check a single element against a simple specifier, like JavaScript's `matches`.

    `#` indicates an id and `.` indicates a class. Used without a tag name they
    match any tag. Any tag can be used by itself or together with `#` and/or `.`.
    You may use any number of class specifiers, but only one id specifier per
    tag name. Complex specifiers are not supported: everything in the specifier
    must relate to one element/tag.

    Examples:
    * `.some-example` matches an element with the class `some-example`
    * `#some-example` matches an element with the id `some-example`
    * `li` matches an `li` element
    * `li.red` matches an `li` with the class `red`
    * `li#red` matches an `li` with the id `red`
    * `input[type="checkbox"]` matches an `input` with the attribute `type="checkbox"`

    Args:
        node (Element): The element to test.
        specifier (str): The simple selector to test the element against.

    Returns:
        bool: The result of `is_equal` for the parsed rule and `node`.

    Raises:
        Exception: If the specifier parses into more than one rule, or into a
            combinator instead of an element rule.
    """

    parsed = parse_specifiers(specifier)

    # More than one parsed token means combinators or nesting were used.
    if len(parsed) > 1:
        raise Exception(f"Complex specifier detected and is not allowed.\n{specifier}")

    rule = parsed[0]
    if not isinstance(rule, dict):
        raise Exception(
            "Specifier must only include tag name, classes, id, and or attribute specfiers.\n"
            "Example: `li.red#sample[class^='form-'][title~='sample']`"
        )

    return is_equal(rule, node)

276 

277 

def is_equal(rule: dict, node: Element) -> bool:
    """Checks if a rule is valid on a node.
    A rule is a dictionary of possible values and each value must
    be valid on the node.

    A rule may have a tag, id, classList, and attribute list:
    * If the `tag` is not `*`, the node's `tag` must match the rule's `tag`
    * If the `id` is provided, the node's `id` must match the rule's `id`
    * If the `classList` is not empty, each class in the `classList` must exist in the node's
    class attribute
    * If the `attributes` list is not empty, each attribute must exist on the node and is
    compared against the node's value using the attribute's comparator. Below is the list of
    possible comparisons.
        1. Exists: `[checked]` yields any element that has the attribute `checked` no matter its
        value.
        2. Equals: `[checked='no']` yields any element with `checked='no'`
        3. Contains: `[class~=sample]` or `[class*=sample]` yields any element with a class
        containing `sample`
        4. Equal to or starts with value-: `[class|=sample]` yields elements that either have
        a class that equals `sample` or a class that starts with `sample-`
        5. Starts with: `[class^=sample]` yields elements with a class that starts with `sample`
        6. Ends with: `[class$="sample"]` yields elements with a class that ends with `sample`

    Args:
        rule (dict): The rule to apply to the node.
        node (Element): The node to validate.

    Returns:
        bool: Whether the node passes all the rules in the dictionary.
    """

    properties = node.properties

    # Validate tag
    if rule["tag"] != "*" and rule["tag"] != node.tag:
        return False

    # Validate id
    if rule["id"] is not None and ("id" not in properties or rule["id"] != node["id"]):
        return False

    # Validate class list
    if rule["classList"]:
        if "class" not in properties:
            return False
        # Class names are whitespace separated tokens; split on any whitespace
        # so tabs, newlines and repeated spaces are handled.
        node_classes = node["class"].split()
        if any(klass not in node_classes for klass in rule["classList"]):
            return False

    # Validate all attributes. A rule without a comparator (`[checked]`) only
    # requires the attribute to exist; otherwise the comparator must pass.
    return all(
        attr["name"] in properties
        and (attr["compare"] is None or __validate_attr(attr, node))
        for attr in rule["attributes"]
    )

332 

333 

def __validate_attr(attr: dict, node: Element):
    """Apply one attribute rule's comparator to the node's attribute value.

    Args:
        attr (dict): Attribute rule with the keys `name`, `compare` and `value`.
        node (Element): The node whose attribute `attr["name"]` is read.

    Returns:
        The result of `is_valid_attr` for a known comparator, or True when the
        comparator is not one of `=`, `|=`, `^=`, `$=`, `*=`, `~=`.
    """
    contains = lambda x, y: y in x  # shared by `*=` and `~=`
    comparators = {
        "=": lambda x, y: x == y,
        "|=": lambda x, y: x == y or x.startswith(f"{y}-"),
        "^=": lambda x, y: x.startswith(y),
        "$=": lambda x, y: x.endswith(y),
        "*=": contains,
        "~=": contains,
    }

    validator = comparators.get(attr["compare"])
    if validator is None:
        # Unknown comparators are not checked; the node value is not read.
        return True

    return is_valid_attr(
        attr=node[attr["name"]],
        sub=attr["value"],
        name=attr["name"],
        validator=validator,
    )

376 

377 

def is_valid_attr(attr: str, sub: str, name: str, validator: Callable) -> bool:
    """Validate an attribute value with a given string and a validator callable.

    For list-like attributes (currently only `class`) the value is split on
    whitespace and each token is checked on its own. Any other attribute is
    checked as a single value.

    Args:
        attr (str): The attribute value found on the node.
        sub (str): The value from the specifier to compare against.
        name (str): The attribute name; decides whether `attr` is split.
        validator (Callable): Called as `validator(item, sub)` for each candidate.

    Returns:
        True if the validator accepts at least one candidate value.
    """
    list_attributes = ("class",)

    if name in list_attributes:
        # Split on any whitespace so tabs, newlines and repeated spaces do not
        # produce empty or merged tokens.
        candidates = attr.split()
    else:
        candidates = [attr]

    return any(validator(item, sub) for item in candidates)

394 

395 

def __parse_el_with_attribute(token: str) -> dict:
    """Parse one element specifier that may carry attribute selectors.

    The token is a tag name, classes and/or an id (e.g. `li.red#sample`),
    optionally followed by one or more `[...]` attribute selectors.

    Args:
        token (str): A single element specifier.

    Returns:
        dict: A rule with the keys `tag` (`*` when no tag is given),
        `classList`, `id` (None when absent) and `attributes` (a list of
        dicts with `name`, `compare` and `value`).

    Raises:
        Exception: If the token is not an element specifier, or if it contains
            more than one id.
    """
    el_classid_from_attr = re.compile(r"([a-zA-Z0-9_#.-]+)((\[.*\])*)")
    el_from_class_from_id = re.compile(r"(#|\.)?([a-zA-Z0-9_-]+)")
    # Unquoted values must not contain brackets; otherwise the value of the
    # first selector in `[a=1][b=2]` would swallow the selectors after it.
    attr_compare_val = re.compile(
        r"\[([a-zA-Z0-9_-]+)([~|^$*]?=)?(\"[^\"]+\"|'[^']+'|[^'\"\[\]]+)?\]"
    )

    element = {
        "tag": "*",
        "classList": [],
        "id": None,
        "attributes": [],
    }

    res = el_classid_from_attr.match(token)
    if res is None:
        raise Exception(f"Invalid element specifier.\n{token}")

    el_class_id, attrs = res.group(1), res.group(2)

    for attr in attr_compare_val.finditer(attrs or ""):
        name, compare, value = attr.groups()
        # Remove only the enclosing quote pair; quotes inside the value stay.
        if value is not None and len(value) >= 2 and value[0] in "'\"" and value[-1] == value[0]:
            value = value[1:-1]
        element["attributes"].append(
            {
                "name": name,
                "compare": compare,
                "value": value,
            }
        )

    for item in el_from_class_from_id.finditer(el_class_id):
        prefix, ident = item.groups()
        if prefix == ".":
            if ident not in element["classList"]:
                element["classList"].append(ident)
        elif prefix == "#":
            if element["id"] is not None:
                raise Exception(f"There may only be one id per element specifier.\n{token}")
            element["id"] = ident
        else:
            # No prefix: the identifier is the tag name.
            element["tag"] = ident

    return element

439 

440 

def __parse_attr_only_element(token: str) -> dict:
    """Parse a specifier made only of `[...]` attribute selectors.

    Args:
        token (str): One or more attribute selectors, e.g. `[checked][type='a']`.

    Returns:
        dict: A rule with `tag` set to `*`, an empty `classList`, `id` set to
        None and `attributes` holding one dict (`name`, `compare`, `value`)
        per attribute selector found in the token.
    """
    # Unquoted values must not contain brackets; otherwise the value of the
    # first selector in `[a=1][b=2]` would swallow the selectors after it.
    attr_compare_val = re.compile(
        r"\[([a-zA-Z0-9_-]+)([~|^$*]?=)?(\"[^\"]+\"|'[^']+'|[^'\"\[\]]+)?\]"
    )

    element = {
        "tag": "*",
        "classList": [],
        "id": None,
        "attributes": [],
    }

    # An empty or missing token simply yields no attributes.
    for attr in attr_compare_val.finditer(token or ""):
        name, compare, value = attr.groups()
        # Remove only the enclosing quote pair; quotes inside the value stay.
        if value is not None and len(value) >= 2 and value[0] in "'\"" and value[-1] == value[0]:
            value = value[1:-1]
        element["attributes"].append(
            {
                "name": name,
                "compare": compare,
                "value": value,
            }
        )

    return element

467 

468 

def parse_specifiers(specifier: str) -> list[dict | str]:
    """Split a specifier into an ordered list of rules.

    Rules:
    * `*` = any element
    * `>` = Everything with certain parent child relationship
    * `+` = first sibling
    * `~` = All after
    * `.` = class
    * `#` = id
    * `[attribute]` = all elements with attribute
    * `[attribute=value]` = all elements with attribute=value
    * `[attribute~=value]` = all elements with attribute containing value
    * `[attribute|=value]` = all elements with attribute=value or attribute starting with value-
    * `node[attribute^=value]` = all elements with attribute starting with value
    * `node[attribute$=value]` = all elements with attribute ending with value
    * `node[attribute*=value]` = all elements with attribute containing value

    Args:
        specifier (str): The full selector string.

    Returns:
        list: One entry per token in order of appearance. `*`, `>`, `+` and `~`
        are kept as strings; element specifiers become rule dicts with the keys
        `tag`, `classList`, `id` and `attributes`. Text that matches no token
        pattern (such as whitespace) is skipped.
    """

    splitter = re.compile(r"([~>\*+])|(([.#]?[a-zA-Z0-9_-]+)+((\[[^\[\]]+\]))*)|(\[[^\[\]]+\])+")

    el_only_attr = re.compile(r"((\[[^\[\]]+\]))+")
    el_with_attr = re.compile(r"([.#]?[a-zA-Z0-9_-]+)+(\[[^\[\]]+\])*")

    combinators = ("*", ">", "+", "~")

    tokens = []
    for found in splitter.finditer(specifier):
        text = found.group()
        if text in combinators:
            tokens.append(text)
        elif el_with_attr.match(text):
            tokens.append(__parse_el_with_attribute(text))
        elif el_only_attr.match(text):
            tokens.append(__parse_attr_only_element(text))

    return tokens