···
text: str = post.get('text', '')
71
-
text = text.encode(encoding='utf-8').decode(encoding='utf-8')
71
+
ut8_text = text.encode(encoding='utf-8')
73
+
def decode(ut8: bytes) -> str:
74
+
return ut8.decode(encoding='utf-8')
facets: list[dict] = post.get('facets', [])
75
-
return [cross.TextToken(text)]
78
+
return [cross.TextToken(decode(ut8_text))]
slices: list[tuple[int, int, str, str]] = []
···
slices.append((index['byteStart'], index['byteEnd'], 'mention', feature['did']))
97
-
return [cross.TextToken(text)]
100
+
return [cross.TextToken(decode(ut8_text))]
slices.sort(key=lambda s: s[0])
unique: list[tuple[int, int, str, str]] = []
···
108
-
return [cross.TextToken(text)]
111
+
return [cross.TextToken(decode(ut8_text))]
tokens: list[cross.Token] = []
···
for start, end, ttype, val in unique:
116
-
tokens.append(cross.TextToken(text[prev:start]))
119
+
tokens.append(cross.TextToken(decode(ut8_text[prev:start])))
120
-
label = text[start:end]
123
+
label = decode(ut8_text[start:end])
···
tokens.append(cross.LinkToken(val, label))
132
-
tokens.append(cross.TagToken(text[start:end]))
136
+
tokens.append(cross.TagToken(decode(ut8_text[start:end])))
134
-
tokens.append(cross.MentionToken(text[start:end], val))
138
+
tokens.append(cross.MentionToken(decode(ut8_text[start:end]), val))
137
-
if prev < len(text):
138
-
tokens.append(cross.TextToken(text[prev:]))
141
+
if prev < len(ut8_text):
142
+
tokens.append(cross.TextToken(decode(ut8_text[prev:])))