# Transformer-DeID: Deidentification of free-text clinical notes with transformers 1.0.0
# (2,216 bytes)
"""Classes and functions for working with entity labels."""
class Label(object):
    """Base class for a label.

    A label contains four attributes of primary interest:
    * entity_type - the type of entity which is labeled
    * start - the start offset of the label in the source text
    * length - the length of the label in the source text
    * entity - the actual text of the entity

    The label covers the half-open offset range
    ``[start, start + length)``.
    """

    def __init__(self, entity_type, start, length, entity):
        """Initialize a label from its type, location and text.

        Args:
            entity_type: The type of entity which is labeled.
            start: Start offset of the label in the source text.
            length: Length of the label in the source text.
            entity: The actual text of the entity.
        """
        self.entity_type = entity_type
        self.start = start
        self.length = length
        self.entity = entity

    def __repr__(self):
        return f'Label({self.entity_type}, {self.start}, {self.length}, {self.entity})'

    def map_entity_type(self, mapping, force_upper: bool = True) -> None:
        """Replace ``entity_type`` in place with ``mapping[entity_type]``.

        Args:
            mapping: Lookup from current entity type to the new one.
            force_upper: If true, upper-case the current entity type
                before looking it up.

        A type missing from ``mapping`` propagates the lookup error
        (``KeyError`` for a plain ``dict``).
        """
        if force_upper:
            self.entity_type = mapping[self.entity_type.upper()]
        else:
            self.entity_type = mapping[self.entity_type]

    def shift(self, s) -> 'Label':
        """Move the label by ``s`` offsets in place and return ``self``."""
        self.start += s
        return self

    def contains(self, i) -> bool:
        """Return true if offset ``i`` lies inside the label.

        The label spans ``[start, start + length)``, so the start offset
        is inside and ``start + length`` is not.
        """
        # The comparisons used to be reversed (start >= i, end < i), which
        # could never hold for a label of non-negative length.
        return (self.start <= i) & (i < (self.start + self.length))

    def overlaps(self, start, stop) -> bool:
        """Return true if the label overlaps the ``start``/``stop`` range.

        True when the label's start falls in ``[start, stop)``, when the
        label's end (``start + length``) falls in ``[start, stop)``, or
        when the label encloses the whole range.
        """
        end = self.start + self.length
        contains_start = (self.start >= start) & (self.start < stop)
        # NOTE(review): the label end is compared inclusively, so a label
        # ending exactly at `start` counts as overlapping. Kept as-is to
        # match `within`; confirm whether callers rely on it.
        contains_stop = (end >= start) & (end < stop)
        # The range sits entirely inside the label. Previously missed,
        # because neither label endpoint falls within the range.
        encloses = (self.start <= start) & (end >= stop)
        return contains_start | contains_stop | encloses

    def within(self, start, stop) -> bool:
        """Return true if the label reaches into the ``start``/``stop`` range.

        True when the label begins before ``stop`` and either begins at
        or after ``start`` or ends (``start + length``) at or after
        ``start``. The label does not need to be fully contained in the
        range.
        """
        # NOTE(review): for non-negative lengths the first clause is
        # implied by the second; both are kept to preserve behaviour.
        after_start = (self.start >= start) or ((self.start + self.length) >= start)
        before_stop = self.start < stop
        return after_start & before_stop
def convert_to_bio_scheme(tokens: list) -> list:
    """Prefix per-token labels with B-/I- markers.

    Args:
        tokens: A list of sequences, each holding one label per token.

    Returns:
        A list of lists of the same shape. A label equal to ``'O'`` is
        kept as ``'O'``; a label equal to the one immediately before it
        in the same sequence becomes ``'I-<label>'``; any other label
        becomes ``'B-<label>'``.
    """
    converted = []
    for labels in tokens:
        tagged = []
        # No predecessor exists for the first label of a sequence.
        previous = None
        for current in labels:
            if current == 'O':
                tag = 'O'
            elif current == previous:
                tag = f'I-{current}'
            else:
                tag = f'B-{current}'
            tagged.append(tag)
            previous = current
        converted.append(tagged)
    return converted