Coverage for phml\utils\locate\select.py: 100%

209 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 12:51 -0600

1"""utils.select 

2 

3A collection of utilities around querying for specific 

4types of data. 

5""" 

6 

7import re 

8from typing import Callable 

9 

10from phml.nodes import AST, Element, Root 

11from phml.utils.travel import visit_children, walk 

12 

13__all__ = ["query", "query_all", "matches", "parse_specifiers"] 

14 

15 

def query(tree: AST | Root | Element, specifier: str) -> Element | None:
    """Return the first element matching a CSS-like specifier.

    Modeled on JavaScript's `querySelector`. `#` introduces an id and `.`
    introduces a class; either may be used alone or attached to a tag name.
    A single element specifier may use any number of classes but only one id.
    Complex specifiers are accepted: whitespace means "descendant of",
    `>` means "child of", `+` means "next element sibling of" and `~`
    means "any later element sibling of".

    Examples:
        * `.some-example` matches the first element with the class `some-example`
        * `#some-example` matches the first element with the id `some-example`
        * `li` matches the first `li` element
        * `li.red` matches the first `li` with the class `red`
        * `li#red` matches the first `li` with the id `red`
        * `input[type="checkbox"]` matches the first `input` with the attribute
          `type="checkbox"`
        * `div.form-control input[type="checkbox"]` matches the first `input` with the
          attribute `type="checkbox"` that is nested inside a `div` with the class
          `form-control`.

    Args:
        tree (AST | Root | Element): Where to search. An `AST` is unwrapped to
            its `tree` attribute before searching.
        specifier (str): The specifier, parsed with `parse_specifiers`.

    Returns:
        Element | None: The first element matching the specifier or None if no
        element was found.
    """

    def later_elements(node: Element) -> list:
        """List the element siblings that follow `node`, in child order."""
        if node.parent is None:
            return []

        siblings = node.parent.children
        # Locate the node by identity first: `index` compares with `==`, which
        # could land on an earlier sibling that merely compares equal.
        position = next(
            (idx for idx, child in enumerate(siblings) if child is node),
            None,
        )
        if position is None:
            position = siblings.index(node)

        # Text and comment nodes are not candidates for sibling combinators.
        return [
            child
            for idx, child in enumerate(siblings)
            if idx > position and child.type == "element"
        ]

    def all_nodes(current: Element, rules: list, include_self: bool = True):
        """Try `rules` on every element yielded by `walk(current)`.

        When `include_self` is False, `current` itself is skipped.
        """
        for node in walk(current):
            if node.type == "element" and (include_self or node is not current):
                found = branch(node, rules)
                if found is not None:
                    return found
        return None

    def all_children(current: Element, rules: list):
        """Try `rules` on every element yielded by `visit_children(current)`."""
        for node in visit_children(current):
            if node.type == "element":
                found = branch(node, rules)
                if found is not None:
                    return found
        return None

    def first_sibling(node: Element, rules: list):
        """Try `rules` on the next element sibling of `node`, if any."""
        siblings = later_elements(node)
        if not siblings:
            return None
        return branch(siblings[0], rules)

    def all_siblings(current: Element, rules: list):
        """Try `rules` on each element sibling after `current`."""
        for sibling in later_elements(current):
            found = branch(sibling, rules)
            if found is not None:
                return found
        return None

    def process_dict(rules: list, node: Element):
        """Check `node` against the element rule `rules[0]`, then continue."""
        if not is_equal(rules[0], node):
            return None

        if len(rules) == 1:
            return node

        if isinstance(rules[1], dict):
            # Two element rules in a row form a descendant relationship, so
            # the search continues strictly below the element just matched.
            return all_nodes(node, rules[1:], False)

        if rules[1] == "*":
            # The wildcard is consumed here; the remaining rules are tried on
            # every element below the one just matched.
            return all_nodes(node, rules[2:], False)

        return branch(node, rules[1:])

    def branch(node: Element, rules: list):  # pylint: disable=too-many-return-statements
        """Based on the current rule, recursively check the nodes.
        If on the last rule then return the current valid node.
        """

        if len(rules) == 0:
            return node

        if isinstance(rules[0], dict):
            return process_dict(rules, node)

        if rules[0] == "*":
            return all_nodes(node, rules[1:])

        if rules[0] == ">":
            return all_children(node, rules[1:])

        if rules[0] == "+":
            return first_sibling(node, rules[1:])

        if rules[0] == "~":
            return all_siblings(node, rules[1:])

        return None

    if isinstance(tree, AST):
        tree = tree.tree

    rules = parse_specifiers(specifier)

    return all_nodes(tree, rules)

131 

132 

def query_all(tree: AST | Root | Element, specifier: str) -> list[Element]:
    """Return every element matching a CSS-like specifier.

    Modeled on JavaScript's `querySelectorAll`. `#` introduces an id and `.`
    introduces a class; either may be used alone or attached to a tag name.
    A single element specifier may use any number of classes but only one id.
    Complex specifiers are accepted: whitespace means "descendant of",
    `>` means "child of", `+` means "next element sibling of" and `~`
    means "any later element sibling of".

    Examples:
        * `.some-example` matches all elements with the class `some-example`
        * `#some-example` matches all elements with the id `some-example`
        * `li` matches all `li` elements
        * `li.red` matches all `li` elements with the class `red`
        * `li#red` matches all `li` elements with the id `red`
        * `input[type="checkbox"]` matches all `input` elements with the attribute
          `type="checkbox"`
        * `div.form-control input[type="checkbox"]` matches all `input` elements with
          the attribute `type="checkbox"` that are nested inside a `div` with the
          class `form-control`.

    Args:
        tree (AST | Root | Element): Where to search. An `AST` is unwrapped to
            its `tree` attribute before searching.
        specifier (str): The specifier, parsed with `parse_specifiers`.

    Returns:
        list[Element]: All elements matching the specifier, each listed once in
        the order first found, or an empty list if no elements were found.
    """

    def later_elements(node: Element) -> list:
        """List the element siblings that follow `node`, in child order."""
        if node.parent is None:
            return []

        siblings = node.parent.children
        # Locate the node by identity first: `index` compares with `==`, which
        # could land on an earlier sibling that merely compares equal.
        position = next(
            (idx for idx, child in enumerate(siblings) if child is node),
            None,
        )
        if position is None:
            position = siblings.index(node)

        # Text and comment nodes are not candidates for sibling combinators.
        return [
            child
            for idx, child in enumerate(siblings)
            if idx > position and child.type == "element"
        ]

    def all_nodes(current: Element, rules: list, include_self: bool = True):
        """Collect matches of `rules` over every element yielded by `walk(current)`.

        When `include_self` is False, `current` itself is skipped.
        """
        results = []
        for node in walk(current):
            if node.type == "element" and (include_self or node is not current):
                results.extend(branch(node, rules))
        return results

    def all_children(current: Element, rules: list):
        """Collect matches of `rules` over `visit_children(current)`."""
        results = []
        for node in visit_children(current):
            if node.type == "element":
                results.extend(branch(node, rules))
        return results

    def first_sibling(node: Element, rules: list):
        """Collect matches of `rules` on the next element sibling of `node`."""
        siblings = later_elements(node)
        if not siblings:
            return []
        return branch(siblings[0], rules)

    def all_siblings(current: Element, rules: list):
        """Collect matches of `rules` on each element sibling after `current`."""
        results = []
        for sibling in later_elements(current):
            results.extend(branch(sibling, rules))
        return results

    def process_dict(rules: list, node: Element):
        """Check `node` against the element rule `rules[0]`, then continue."""
        if not is_equal(rules[0], node):
            return []

        if len(rules) == 1:
            return [node]

        if isinstance(rules[1], dict):
            # Two element rules in a row form a descendant relationship, so
            # the search continues strictly below the element just matched.
            return all_nodes(node, rules[1:], False)

        if rules[1] == "*":
            # The wildcard is consumed here; the remaining rules are tried on
            # every element below the one just matched.
            return all_nodes(node, rules[2:], False)

        return branch(node, rules[1:])

    def branch(node: Element, rules: list):  # pylint: disable=too-many-return-statements
        """Based on the current rule, recursively check the nodes.
        If on the last rule then return the current valid node.
        """

        if len(rules) == 0:
            return [node]

        if isinstance(rules[0], dict):
            return process_dict(rules, node)

        if rules[0] == "*":
            return all_nodes(node, rules[1:])

        if rules[0] == ">":
            return all_children(node, rules[1:])

        if rules[0] == "+":
            return first_sibling(node, rules[1:])

        if rules[0] == "~":
            return all_siblings(node, rules[1:])

        # Always hand a list back so callers can safely `extend` with it.
        return []

    if isinstance(tree, AST):
        tree = tree.tree

    rules = parse_specifiers(specifier)
    found = all_nodes(tree, rules)

    # The same element can be reached through several ancestors, so drop
    # repeats. Identity is used rather than `==` so that distinct elements
    # which happen to compare equal are all kept.
    seen = set()
    unique = []
    for node in found:
        if id(node) not in seen:
            seen.add(id(node))
            unique.append(node)
    return unique

242 

243 

def matches(node: Element, specifier: str) -> bool:
    """Check whether a single element satisfies a specifier.

    Works like JavaScript's `matches`. `#` introduces an id and `.` introduces
    a class; either may be used alone or attached to a tag name. Any number of
    classes may be used but only one id. Complex specifiers are not supported:
    everything in the specifier must relate to one element/tag.

    Examples:
        * `.some-example` is True for an element with the class `some-example`
        * `#some-example` is True for an element with the id `some-example`
        * `li` is True for an `li` element
        * `li.red` is True for an `li` with the class `red`
        * `li#red` is True for an `li` with the id `red`
        * `input[type="checkbox"]` is True for an `input` with the attribute
          `type="checkbox"`

    Args:
        node (Element): The element to test.
        specifier (str): A specifier describing exactly one element.

    Returns:
        bool: The result of `is_equal` for the parsed rule and `node`.

    Raises:
        Exception: If the specifier yields no rule, more than one rule, or a
            rule that is a combinator rather than an element description.
    """

    rules = parse_specifiers(specifier)

    # Nothing parsed (empty or unrecognised specifier): report it clearly
    # instead of failing on `rules[0]` below.
    if len(rules) == 0:
        raise Exception(f"Specifier must describe an element.\n{specifier}")

    if len(rules) > 1:
        raise Exception(f"Complex specifier detected and is not allowed.\n{specifier}")

    if not isinstance(rules[0], dict):
        raise Exception(
            "Specifier must only include tag name, classes, id, and or attribute specfiers.\n"
            "Example: `li.red#sample[class^='form-'][title~='sample']`"
        )

    return is_equal(rules[0], node)

272 

273 

def is_equal(rule: dict, node: Element) -> bool:
    """Check whether a node satisfies every part of a rule.

    A rule is a dictionary with the keys `tag`, `id`, `classList` and
    `attributes`:
    * If `tag` is not `*`, the node's `tag` must equal the rule's `tag`.
    * If `id` is not None, the node must have an `id` property equal to it.
    * Each class in `classList` must appear in the node's whitespace separated
      `class` property.
    * Each entry in `attributes` must name a property present on the node and,
      when the entry has a comparator, pass that comparison. Below is the list
      of possible comparisons.
        1. Exists: `[checked]` yields any element that has the attribute `checked`
        no matter its value.
        2. Equals: `[checked='no']` yields any element with `checked='no'`
        3. Contains: `[class~=sample]` or `[class*=sample]` yields any element with a
        class containing `sample`
        4. Equal to or starts with value-: `[class|=sample]` yields elements that
        either have a class that equals `sample` or a class that starts with
        `sample-`
        5. Starts with: `[class^=sample]` yields elements with a class that starts
        with `sample`
        6. Ends with: `[class$="sample"]` yields elements with a class that ends
        with `sample`

    Args:
        rule (dict): The rule to apply to the node.
        node (Element): The node to validate.

    Returns:
        bool: Whether the node passes all the checks in the rule.
    """

    # Validate tag
    if rule["tag"] != "*" and rule["tag"] != node.tag:
        return False

    # Validate id
    if rule["id"] is not None and ("id" not in node.properties or rule["id"] != node["id"]):
        return False

    # Validate class list
    if rule["classList"]:
        if "class" not in node.properties:
            return False
        # Split once on any whitespace so tabs/newlines/repeated spaces work.
        node_classes = node["class"].split()
        if any(klass not in node_classes for klass in rule["classList"]):
            return False

    # Validate all attributes. An attribute without a comparator is a plain
    # existence check, so being present on the node is enough.
    return all(
        attr["name"] in node.properties
        and (attr["compare"] is None or __validate_attr(attr, node))
        for attr in rule["attributes"]
    )

328 

329 

def __validate_attr(attr: dict, node: Element):
    """Run the comparison named by `attr["compare"]` against the node.

    The node's value for `attr["name"]` is handed to `is_valid_attr` together
    with `attr["value"]` and a predicate chosen by the comparator:
    `=` equality, `|=` equality or a `value-` prefix, `^=` prefix, `$=`
    suffix, and `*=` / `~=` substring.

    Returns:
        The result of `is_valid_attr`, or None when the comparator is not one
        of the ones listed above.
    """
    predicates = {
        "=": lambda x, y: x == y,
        "|=": lambda x, y: x == y or x.startswith(f"{y}-"),
        "^=": lambda x, y: x.startswith(y),
        "$=": lambda x, y: x.endswith(y),
        "*=": lambda x, y: y in x,
        "~=": lambda x, y: y in x,
    }

    predicate = predicates.get(attr["compare"])
    if predicate is None:
        return None

    return is_valid_attr(
        attr=node[attr["name"]],
        sub=attr["value"],
        name=attr["name"],
        validator=predicate,
    )

370 

371 

def is_valid_attr(attr: str, sub: str, name: str, validator: Callable) -> bool:
    """Validate an attribute value against a string using a validator callable.

    For list-like attributes (currently only `class`) the value is split on
    whitespace and each piece is checked on its own; for every other attribute
    the whole value is checked as a single item.

    Args:
        attr (str): The attribute value found on the node.
        sub (str): The value from the specifier to compare against.
        name (str): The attribute name, used to decide whether to split.
        validator (Callable): Called as `validator(item, sub)`; a truthy result
            means the item passes.

    Returns:
        bool: True if at least one item passes the validator.
    """
    list_attributes = ["class"]

    # `split()` with no argument handles tabs, newlines and repeated spaces.
    compare_values = attr.split() if name in list_attributes else [attr]

    return any(validator(item, sub) for item in compare_values)

388 

389 

def __parse_el_with_attribute(token: str) -> dict:
    """Parse one element specifier that starts with a tag, class or id.

    The token may be followed by any number of `[attribute]` groups, for
    example `li.red#main[type="checkbox"][checked]`.

    Args:
        token (str): The element specifier text.

    Returns:
        dict: A rule with the keys `tag` (`*` when no tag is given),
        `classList`, `id` (None when no id is given) and `attributes`, a list
        of `{"name", "compare", "value"}` dictionaries.

    Raises:
        Exception: If the token contains more than one id.
    """
    el_classid_from_attr = re.compile(r"([a-zA-Z0-9_#.-]+)((\[.*\])*)")
    el_from_class_from_id = re.compile(r"(#|\.)?([a-zA-Z0-9_-]+)")
    # An unquoted value must not contain brackets, otherwise it would run on
    # into the following `[...]` groups. Quoted values may be empty.
    attr_compare_val = re.compile(
        r"\[([a-zA-Z0-9_-]+)([~|^$*]?=)?(\"[^\"]*\"|'[^']*'|[^'\"\[\]]+)?\]"
    )

    element = {
        "tag": "*",
        "classList": [],
        "id": None,
        "attributes": [],
    }

    res = el_classid_from_attr.match(token)

    el_class_id, attrs = res.group(1), res.group(2)

    if attrs:
        for attr in attr_compare_val.finditer(attrs):
            name, compare, value = attr.groups()
            if value is not None:
                value = value.strip("'\"")
            element["attributes"].append(
                {
                    "name": name,
                    "compare": compare,
                    "value": value,
                }
            )

    if el_class_id:
        for item in el_from_class_from_id.finditer(el_class_id):
            prefix, label = item.groups()
            if prefix == ".":
                if label not in element["classList"]:
                    element["classList"].append(label)
            elif prefix == "#":
                if element["id"] is not None:
                    raise Exception(
                        f"There may only be one id per element specifier.\n{token}"
                    )
                element["id"] = label
            else:
                element["tag"] = label or "*"

    return element

435 

436 

def __parse_attr_only_element(token: str) -> dict:
    """Parse an element specifier made only of `[attribute]` groups.

    Args:
        token (str): Text such as `[type="checkbox"][checked]`.

    Returns:
        dict: A rule with `tag` set to `*`, an empty `classList`, `id` set to
        None and `attributes`, a list of `{"name", "compare", "value"}`
        dictionaries in the order they appear in the token.
    """
    # An unquoted value must not contain brackets, otherwise it would run on
    # into the following `[...]` groups. Quoted values may be empty.
    attr_compare_val = re.compile(
        r"\[([a-zA-Z0-9_-]+)([~|^$*]?=)?(\"[^\"]*\"|'[^']*'|[^'\"\[\]]+)?\]"
    )

    element = {
        "tag": "*",
        "classList": [],
        "id": None,
        "attributes": [],
    }

    if token:
        for attr in attr_compare_val.finditer(token):
            name, compare, value = attr.groups()
            if value is not None:
                value = value.strip("'\"")
            element["attributes"].append(
                {
                    "name": name,
                    "compare": compare,
                    "value": value,
                }
            )

    return element

463 

464 

def parse_specifiers(specifier: str) -> list[str | dict]:
    """Split a specifier into an ordered list of rules.

    Combinators are kept as their literal string and each element specifier
    becomes a rule dictionary. Text that matches none of the patterns (such
    as whitespace) produces no entry.

    Rules:
    * `*` = any element
    * `>` = Everything with certain parent child relationship
    * `+` = first sibling
    * `~` = All after
    * `.` = class
    * `#` = id
    * `[attribute]` = all elements with attribute
    * `[attribute=value]` = all elements with attribute=value
    * `[attribute~=value]` = all elements with attribute containing value
    * `[attribute|=value]` = all elements with attribute=value or attribute starting with value-
    * `node[attribute^=value]` = all elements with attribute starting with value
    * `node[attribute$=value]` = all elements with attribute ending with value
    * `node[attribute*=value]` = all elements with attribute containing value

    Args:
        specifier (str): The full specifier text.

    Returns:
        list[str | dict]: The combinator strings and rule dictionaries in the
        order they appear in the specifier.
    """

    splitter = re.compile(r"([~>\*+])|(([.#]?[a-zA-Z0-9_-]+)+((\[[^\[\]]+\]))*)|(\[[^\[\]]+\])+")

    el_only_attr = re.compile(r"((\[[^\[\]]+\]))+")
    el_with_attr = re.compile(r"([.#]?[a-zA-Z0-9_-]+)+(\[[^\[\]]+\])*")

    tokens = []
    for found in splitter.finditer(specifier):
        text = found.group()
        if text in ("*", ">", "+", "~"):
            tokens.append(text)
        elif el_with_attr.match(text):
            tokens.append(__parse_el_with_attribute(text))
        elif el_only_attr.match(text):
            tokens.append(__parse_attr_only_element(text))

    return tokens