Module uim.model.helpers.text_extractor
Expand source code
# -*- coding: utf-8 -*-
# Copyright © 2021 Wacom Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
import uuid
from typing import List, Tuple, Optional
from uim.codec.parser.uim import UIMParser
from uim.model.base import InkModelException
from uim.model.helpers.treeiterator import PreOrderEnumerator
from uim.model.ink import InkModel
from uim.model.semantics.node import InkNode, StrokeGroupNode, StrokeNode
from uim.model.semantics.syntax import WORD, SEMANTIC_IS, TEXT_LINE, SEMANTIC_HAS_URI, \
SEMANTIC_HAS_RELEVANT_CONCEPT, SEMANTIC_HAS_NAMED_ENTITY, CommonViews, SEMANTIC_HAS_TYPE
def uim_extract_text_and_semantics(uim_bytes: bytes, hwr_view: str = CommonViews.HWR_VIEW.value,
                                   ner_view: Optional[str] = None) \
        -> Tuple[List[dict], List[dict]]:
    """
    Parse a serialized Universal Ink Model and extract its text lines and entities.

    The bytes are decoded with `UIMParser` and the resulting model is handed to
    `uim_extract_text_and_semantics_from`, which performs the actual extraction.

    Parameters
    ----------
    uim_bytes: `bytes`
        Byte array with RIFF file from Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.
    ner_view: `Optional[str]`
        Name of the NER view if needed.

    Returns
    -------
    text: `List[dict]`
        List of text lines. Each line has its own dict containing the text, the bounding box, and all words.
    entities: `List[dict]`
        List of entities as produced by `uim_extract_text_and_semantics_from`.

    Raises
    ------
    `InkModelException`
        If the Universal Ink Model does not contain the view with the requested view name.
    """
    parsed_model: InkModel = UIMParser().parse(uim_bytes)
    return uim_extract_text_and_semantics_from(ink_model=parsed_model, hwr_view=hwr_view, ner_view=ner_view)
def __collected_stroke_ids__(node: StrokeGroupNode) -> List[uuid.UUID]:
    """
    Collect the ids of all strokes found below a stroke group.

    Parameters
    ----------
    node: `StrokeGroupNode`
        Group whose children are inspected. Nested `StrokeGroupNode` children are descended into;
        children that are neither `StrokeNode` nor `StrokeGroupNode` are ignored.

    Returns
    -------
    stroke_ids: `List[uuid.UUID]`
        Stroke ids in depth-first order, following the order of the children.
    """
    collected: List[uuid.UUID] = []
    # Explicit stack instead of recursion; children are pushed reversed so that they are popped
    # (and therefore visited) in their original order.
    pending: list = list(node.children)[::-1]
    while pending:
        current = pending.pop()
        if isinstance(current, StrokeNode):
            collected.append(current.stroke.id)
        elif isinstance(current, StrokeGroupNode):
            pending.extend(list(current.children)[::-1])
    return collected
def uim_extract_text_and_semantics_from(ink_model: InkModel, hwr_view: str = CommonViews.HWR_VIEW.value,
                                        ner_view: Optional[str] = None) -> Tuple[List[dict], List[dict]]:
    """
    Extract the text lines and, optionally, the named entities from a Universal Ink Model.

    The knowledge graph is scanned once to find the nodes typed as word or text line and the nodes that
    reference a named entity. The HWR view is then traversed to assemble the text lines; if `ner_view` is
    given, that view is traversed to assemble the entities.

    Parameters
    ----------
    ink_model: `InkModel`
        Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.
    ner_view: `Optional[str]`
        Name of the NER view if needed. If `None`, no entities are extracted.

    Returns
    -------
    text_lines: `List[dict]`
        One dict per text line with the keys `'line'` (text of the line), `'box'` (group bounding box of the
        line node) and `'words'`. Each word is a dict with the keys `'word'`, `'box'` and `'strokes'` (list of
        stroke ids). Every word whose text is not found in `string.punctuation` is prefixed with a blank, thus
        `'line'` starts with a blank whenever its first word is not punctuation.
    entities: `List[dict]`
        One dict per named entity reference with the keys `'uri'`, `'statements'` and `'strokes'`.
        `'statements'` maps each predicate to its object; the `SEMANTIC_HAS_TYPE` predicate always maps to a
        list, any other predicate maps to a list only if it occurs more than once. Empty if `ner_view` is `None`.

    Raises
    ------
    `InkModelException`
        If a `KeyError` occurs while resolving or traversing one of the requested views, which is reported as
        the Universal Ink Model not containing the view with the requested view name.

    Examples
    --------
    >>> from uim.codec.parser.uim import UIMParser
    >>> from uim.model.helpers.text_extractor import uim_extract_text_and_semantics_from
    >>> from uim.model.ink import InkModel
    >>> from uim.model.semantics.syntax import CommonViews, SEMANTIC_HAS_URI, SEMANTIC_HAS_LABEL, SEMANTIC_HAS_TYPE
    >>>
    >>> parser: UIMParser = UIMParser()
    >>> ink_model: InkModel = parser.parse('../ink/uim_3.1.0/2) Digital Ink is processable 1 (3.1 delta).uim')
    >>> if ink_model.has_knowledge_graph():
    ...     # Extract text lines and entities from model
    ...     text_lines, entities = uim_extract_text_and_semantics_from(ink_model, hwr_view=CommonViews.HWR_VIEW.value,
    ...                                                                ner_view=CommonViews.NER_VIEW.value)
    ...     line_number: int = 1
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     print(' Text lines:')
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     for line in text_lines:
    ...         print(f'{line_number}. Text line: {line["line"]} | {line["box"]}')
    ...         word_num: int = 1
    ...         for word in line['words']:
    ...             print(f' {word_num}. Word: {word["word"]} | {word["box"]}')
    ...             print(f' -> Stroke UUIDs: {[str(w) for w in word["strokes"]]}')
    ...             word_num += 1
    ...         line_number += 1
    ...     print()
    ...     entity_number: int = 1
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     print(' Entities:')
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     for entity in entities:
    ...         print(f'{entity_number}. URI: {entity["statements"][SEMANTIC_HAS_URI]} - '
    ...               f'{entity["statements"][SEMANTIC_HAS_LABEL]} '
    ...               f'({entity["statements"][SEMANTIC_HAS_TYPE]})')
    ...         entity_number += 1
    """
    lines: List[dict] = []
    entities: List[dict] = []
    text_nodes: dict = {}  # word node uri -> recognised text
    text_line_nodes: set = set()  # uris of the text line nodes; a set keeps the lookup per node cheap
    ne_node_mapping: dict = {}  # node uri -> list of named entity uris
    knowledge_graph = ink_model.knowledge_graph

    for s in knowledge_graph.statements:
        if s.object == WORD:
            all_statements = knowledge_graph.all_statements_for(s.subject, predicate=SEMANTIC_IS)
            # Only words with exactly one SEMANTIC_IS statement are considered
            if len(all_statements) == 1:
                text_nodes[s.subject] = all_statements[0].object
        elif s.object == TEXT_LINE:
            text_line_nodes.add(s.subject)
        elif s.predicate == SEMANTIC_HAS_NAMED_ENTITY:
            ne_node_mapping.setdefault(s.subject, []).append(s.object)

    try:
        root: InkNode = ink_model.view_root(hwr_view)
        for node in PreOrderEnumerator(root):
            # First find text lines
            if node.uri not in text_line_nodes:
                continue
            box = node.group_bounding_box
            words: List[dict] = []
            text_parts: List[str] = []
            # Add each word to line
            for word_node in node.children:
                if word_node.uri not in text_nodes:
                    continue
                t: str = text_nodes[word_node.uri]
                words.append({'word': t, 'box': word_node.group_bounding_box,
                              'strokes': __collected_stroke_ids__(word_node)})
                # Punctuation is attached to the previous word, everything else is separated by a blank
                text_parts.append(t if t in string.punctuation else ' {}'.format(t))
            lines.append({'line': ''.join(text_parts), 'box': box, 'words': words})
    except KeyError as e:
        raise InkModelException(f'The requested handwriting recognition view does not exist. {e}') from e

    if ner_view is not None:
        try:
            ner_root: InkNode = ink_model.view_root(ner_view)
            for group in PreOrderEnumerator(ner_root):
                if not isinstance(group, StrokeGroupNode):
                    continue
                for child in group.children:
                    for ne_uri in ne_node_mapping.get(child.uri, ()):
                        # Add statements
                        statements: dict = {}
                        for st in knowledge_graph.all_statements_for(ne_uri):
                            if st.predicate == SEMANTIC_HAS_TYPE:
                                # Types are always collected as a list
                                statements.setdefault(st.predicate, []).append(st.object)
                            elif st.predicate in statements:
                                previous = statements[st.predicate]
                                if isinstance(previous, list):
                                    previous.append(st.object)
                                else:
                                    # Second occurrence of the predicate: override as list
                                    statements[st.predicate] = [previous, st.object]
                            else:
                                statements[st.predicate] = st.object
                        strokes: list = __collected_stroke_ids__(child) if isinstance(child, StrokeGroupNode) else []
                        entities.append({'uri': ne_uri, 'statements': statements, 'strokes': strokes})
        except KeyError as e:
            raise InkModelException(f'The requested named entity recognition view does not exist. {e}') from e
    return lines, entities
Functions
def uim_extract_text_and_semantics(uim_bytes: bytes, hwr_view: str = 'hwr', ner_view: Optional[str] = None) ‑> Tuple[List[dict], List[dict]]
-
Extracting the text from Universal Ink Model.
Parameters
uim_bytes
:bytes
- Byte array with RIFF file from Universal Ink Model
hwr_view
:str
- HWR view.
ner_view
:str
- NER view if needed.
Returns
text
:List[dict]
- List of text lines. Each line has its own dict containing the text, the bounding box, and all words.
A second list containing the extracted entities is returned alongside it.
Raises
`InkModelException` - If the Universal Ink Model does not contain the view with the requested view name.
Expand source code
def uim_extract_text_and_semantics(uim_bytes: bytes, hwr_view: str = CommonViews.HWR_VIEW.value,
                                   ner_view: Optional[str] = None) \
        -> Tuple[List[dict], List[dict]]:
    """
    Parse a serialized Universal Ink Model and extract its text lines and entities.

    The bytes are decoded with `UIMParser` and the resulting model is handed to
    `uim_extract_text_and_semantics_from`, which performs the actual extraction.

    Parameters
    ----------
    uim_bytes: `bytes`
        Byte array with RIFF file from Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.
    ner_view: `Optional[str]`
        Name of the NER view if needed.

    Returns
    -------
    text: `List[dict]`
        List of text lines. Each line has its own dict containing the text, the bounding box, and all words.
    entities: `List[dict]`
        List of entities as produced by `uim_extract_text_and_semantics_from`.

    Raises
    ------
    `InkModelException`
        If the Universal Ink Model does not contain the view with the requested view name.
    """
    parsed_model: InkModel = UIMParser().parse(uim_bytes)
    return uim_extract_text_and_semantics_from(ink_model=parsed_model, hwr_view=hwr_view, ner_view=ner_view)
def uim_extract_text_and_semantics_from(ink_model: InkModel, hwr_view: str = 'hwr', ner_view: Optional[str] = None) ‑> Tuple[List[dict], List[dict]]
-
Extracting the text from Universal Ink Model.
Parameters
ink_model
:InkModel -
- Universal Ink Model
hwr_view
:str -
- Name of the HWR view.
ner_view
:str -
- Name of the NER view if needed.
Returns
tuple(list of text lines (including bounding box), list of knowledge URIs)
Raises
`InkModelException` - If the Universal Ink Model does not contain the view with the requested view name.
Examples
>>> from uim.codec.parser.uim import UIMParser >>> from uim.model.helpers.text_extractor import uim_extract_text_and_semantics_from >>> from uim.model.ink import InkModel >>> from uim.model.semantics.syntax import CommonViews, SEMANTIC_HAS_URI, SEMANTIC_HAS_LABEL, SEMANTIC_HAS_TYPE >>> >>> parser: UIMParser = UIMParser() >>> ink_model: InkModel = parser.parse('../ink/uim_3.1.0/2) Digital Ink is processable 1 (3.1 delta).uim') >>> if ink_model.has_knowledge_graph(): >>> # Extract text lines and entities from model >>> text_lines, entities = uim_extract_text_and_semantics_from(ink_model, hwr_view=CommonViews.HWR_VIEW.value, >>> ner_view=CommonViews.NER_VIEW.value) >>> line_number: int = 1 >>> print('---------------------------------------------------------------------------------------------------') >>> print(' Text lines:') >>> print('---------------------------------------------------------------------------------------------------') >>> for line in text_lines: >>> print(f'{line_number}. Text line: {line["line"]} | {line["box"]}') >>> word_num: int = 1 >>> for word in line['words']: >>> print(f' {word_num}. Word: {word["word"]} | {word["box"]}') >>> print(f' -> Stroke UUIDs: {[str(w) for w in word["strokes"]]}') >>> word_num += 1 >>> line_number += 1 >>> print() >>> entity_number: int = 1 >>> print('---------------------------------------------------------------------------------------------------') >>> print(' Entities:') >>> print('---------------------------------------------------------------------------------------------------') >>> for entity in entities: >>> print(f'{entity_number}. URI: {entity["statements"][SEMANTIC_HAS_URI]} - ' >>> f'{entity["statements"][SEMANTIC_HAS_LABEL]} ' >>> f'({entity["statements"][SEMANTIC_HAS_TYPE]})') >>> entity_number += 1
Expand source code
def uim_extract_text_and_semantics_from(ink_model: InkModel, hwr_view: str = CommonViews.HWR_VIEW.value,
                                        ner_view: Optional[str] = None) -> Tuple[List[dict], List[dict]]:
    """
    Extract the text lines and, optionally, the named entities from a Universal Ink Model.

    The knowledge graph is scanned once to find the nodes typed as word or text line and the nodes that
    reference a named entity. The HWR view is then traversed to assemble the text lines; if `ner_view` is
    given, that view is traversed to assemble the entities.

    Parameters
    ----------
    ink_model: `InkModel`
        Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.
    ner_view: `Optional[str]`
        Name of the NER view if needed. If `None`, no entities are extracted.

    Returns
    -------
    text_lines: `List[dict]`
        One dict per text line with the keys `'line'` (text of the line), `'box'` (group bounding box of the
        line node) and `'words'`. Each word is a dict with the keys `'word'`, `'box'` and `'strokes'` (list of
        stroke ids). Every word whose text is not found in `string.punctuation` is prefixed with a blank, thus
        `'line'` starts with a blank whenever its first word is not punctuation.
    entities: `List[dict]`
        One dict per named entity reference with the keys `'uri'`, `'statements'` and `'strokes'`.
        `'statements'` maps each predicate to its object; the `SEMANTIC_HAS_TYPE` predicate always maps to a
        list, any other predicate maps to a list only if it occurs more than once. Empty if `ner_view` is `None`.

    Raises
    ------
    `InkModelException`
        If a `KeyError` occurs while resolving or traversing one of the requested views, which is reported as
        the Universal Ink Model not containing the view with the requested view name.

    Examples
    --------
    >>> from uim.codec.parser.uim import UIMParser
    >>> from uim.model.helpers.text_extractor import uim_extract_text_and_semantics_from
    >>> from uim.model.ink import InkModel
    >>> from uim.model.semantics.syntax import CommonViews, SEMANTIC_HAS_URI, SEMANTIC_HAS_LABEL, SEMANTIC_HAS_TYPE
    >>>
    >>> parser: UIMParser = UIMParser()
    >>> ink_model: InkModel = parser.parse('../ink/uim_3.1.0/2) Digital Ink is processable 1 (3.1 delta).uim')
    >>> if ink_model.has_knowledge_graph():
    ...     # Extract text lines and entities from model
    ...     text_lines, entities = uim_extract_text_and_semantics_from(ink_model, hwr_view=CommonViews.HWR_VIEW.value,
    ...                                                                ner_view=CommonViews.NER_VIEW.value)
    ...     line_number: int = 1
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     print(' Text lines:')
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     for line in text_lines:
    ...         print(f'{line_number}. Text line: {line["line"]} | {line["box"]}')
    ...         word_num: int = 1
    ...         for word in line['words']:
    ...             print(f' {word_num}. Word: {word["word"]} | {word["box"]}')
    ...             print(f' -> Stroke UUIDs: {[str(w) for w in word["strokes"]]}')
    ...             word_num += 1
    ...         line_number += 1
    ...     print()
    ...     entity_number: int = 1
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     print(' Entities:')
    ...     print('---------------------------------------------------------------------------------------------------')
    ...     for entity in entities:
    ...         print(f'{entity_number}. URI: {entity["statements"][SEMANTIC_HAS_URI]} - '
    ...               f'{entity["statements"][SEMANTIC_HAS_LABEL]} '
    ...               f'({entity["statements"][SEMANTIC_HAS_TYPE]})')
    ...         entity_number += 1
    """
    lines: List[dict] = []
    entities: List[dict] = []
    text_nodes: dict = {}  # word node uri -> recognised text
    text_line_nodes: set = set()  # uris of the text line nodes; a set keeps the lookup per node cheap
    ne_node_mapping: dict = {}  # node uri -> list of named entity uris
    knowledge_graph = ink_model.knowledge_graph

    for s in knowledge_graph.statements:
        if s.object == WORD:
            all_statements = knowledge_graph.all_statements_for(s.subject, predicate=SEMANTIC_IS)
            # Only words with exactly one SEMANTIC_IS statement are considered
            if len(all_statements) == 1:
                text_nodes[s.subject] = all_statements[0].object
        elif s.object == TEXT_LINE:
            text_line_nodes.add(s.subject)
        elif s.predicate == SEMANTIC_HAS_NAMED_ENTITY:
            ne_node_mapping.setdefault(s.subject, []).append(s.object)

    try:
        root: InkNode = ink_model.view_root(hwr_view)
        for node in PreOrderEnumerator(root):
            # First find text lines
            if node.uri not in text_line_nodes:
                continue
            box = node.group_bounding_box
            words: List[dict] = []
            text_parts: List[str] = []
            # Add each word to line
            for word_node in node.children:
                if word_node.uri not in text_nodes:
                    continue
                t: str = text_nodes[word_node.uri]
                words.append({'word': t, 'box': word_node.group_bounding_box,
                              'strokes': __collected_stroke_ids__(word_node)})
                # Punctuation is attached to the previous word, everything else is separated by a blank
                text_parts.append(t if t in string.punctuation else ' {}'.format(t))
            lines.append({'line': ''.join(text_parts), 'box': box, 'words': words})
    except KeyError as e:
        raise InkModelException(f'The requested handwriting recognition view does not exist. {e}') from e

    if ner_view is not None:
        try:
            ner_root: InkNode = ink_model.view_root(ner_view)
            for group in PreOrderEnumerator(ner_root):
                if not isinstance(group, StrokeGroupNode):
                    continue
                for child in group.children:
                    for ne_uri in ne_node_mapping.get(child.uri, ()):
                        # Add statements
                        statements: dict = {}
                        for st in knowledge_graph.all_statements_for(ne_uri):
                            if st.predicate == SEMANTIC_HAS_TYPE:
                                # Types are always collected as a list
                                statements.setdefault(st.predicate, []).append(st.object)
                            elif st.predicate in statements:
                                previous = statements[st.predicate]
                                if isinstance(previous, list):
                                    previous.append(st.object)
                                else:
                                    # Second occurrence of the predicate: override as list
                                    statements[st.predicate] = [previous, st.object]
                            else:
                                statements[st.predicate] = st.object
                        strokes: list = __collected_stroke_ids__(child) if isinstance(child, StrokeGroupNode) else []
                        entities.append({'uri': ne_uri, 'statements': statements, 'strokes': strokes})
        except KeyError as e:
            raise InkModelException(f'The requested named entity recognition view does not exist. {e}') from e
    return lines, entities