OpusCleaner
Laser filter error: ValueError: could not convert string to float: b''
The laser_similarity filter fails on some datasets, for example the en-ru OPUS XLEnt corpus (XLEnt-v1.2). Traceback from the failing task:
[task 2024-04-17T19:48:57.880Z] [11/12:laser_similarity] Traceback (most recent call last):
[task 2024-04-17T19:48:57.881Z] [11/12:laser_similarity] File "/builds/worker/.local/lib/python3.10/site-packages/opuscleaner/filters/../threshold.py", line 142, in wrapper
[task 2024-04-17T19:48:57.881Z] [11/12:laser_similarity] return fn(*args, **kwargs)
[task 2024-04-17T19:48:57.881Z] [11/12:laser_similarity] File "/builds/worker/.local/lib/python3.10/site-packages/opuscleaner/filters/../threshold.py", line 192, in threshold_scores
[task 2024-04-17T19:48:57.881Z] [11/12:laser_similarity] item[1].score = float(fchild.readline())
[task 2024-04-17T19:48:57.881Z] [11/12:laser_similarity] ValueError: could not convert string to float: b''
Cleaning config:
{
  "version": 1,
  "files": [
    "XLEnt-v1.2.en-ru.en.gz",
    "XLEnt-v1.2.en-ru.ru.gz"
  ],
  "filters": [
    {
      "filter": "remove_empty_lines",
      "parameters": {},
      "language": null
    },
    {
      "filter": "normalize_whitespace",
      "parameters": {
        "COLLAPSE": true
      },
      "language": "en"
    },
    {
      "filter": "normalize_whitespace",
      "parameters": {
        "COLLAPSE": true
      },
      "language": "ru"
    },
    {
      "filter": "deescape-special-chars",
      "parameters": {
        "LANG1": "other"
      },
      "language": "en"
    },
    {
      "filter": "deescape-special-chars",
      "parameters": {
        "LANG1": "other"
      },
      "language": "ru"
    },
    {
      "filter": "remove_frequent_patterns",
      "parameters": {
        "PATTERN_FILE": "remove_frequent_patterns.txt"
      },
      "language": null
    },
    {
      "filter": "max_length",
      "parameters": {
        "MAXLENGTH": 150,
        "MINLENGTH": 1
      },
      "language": null
    },
    {
      "filter": "max_word_length",
      "parameters": {
        "MAXWORDLENGTH": 150
      },
      "language": null
    },
    {
      "filter": "fix_wiki",
      "parameters": {
        "ALWAYS": false,
        "FOOTNOTES": true,
        "URLS": true,
        "WIKILINKS": true,
        "CODE": true,
        "HEADINGS": true,
        "REMOVEEMPTYLINES": true
      },
      "language": null
    },
    {
      "filter": "alpha_ratio",
      "parameters": {
        "LANG1": "en",
        "LANG2": "ru",
        "SRCWORDRAT": 0.4,
        "TRGWORDRAT": 0.4,
        "SRCALPHARAT": 0.5,
        "TRGALPHARAT": 0.5,
        "DEBUG": false
      },
      "language": null
    },
    {
      "filter": "src_trg_ratio",
      "parameters": {
        "RATIO": 0.5,
        "LOG": false
      },
      "language": null
    },
    {
      "filter": "num_mismatch",
      "parameters": {
        "RATIO": 1,
        "DEBUG": false
      },
      "language": null
    },
    {
      "filter": "fasttext_filter",
      "parameters": {
        "FASTTEXT_MODEL_TYPE": "large",
        "LANG1": "en",
        "LANG2": "ru"
      },
      "language": null
    },
    {
      "filter": "laser_similarity",
      "parameters": {
        "THRESHOLD": "0.85",
        "SRCLANG": "en",
        "TGTLANG": "ru"
      },
      "language": null
    }
  ]
}