from __future__ import annotations
import gzip
from . import InputExample
class PairedFilesReader:
"""Reads in the a Pair Dataset, split in two files"""
def __init__(self, filepaths):
self.filepaths = filepaths
def get_examples(self, max_examples=0):
fIns = []
for filepath in self.filepaths:
fIn = (
gzip.open(filepath, "rt", encoding="utf-8")
if filepath.endswith(".gz")
else open(filepath, encoding="utf-8")
)
fIns.append(fIn)
examples = []
eof = False
while not eof:
texts = []
for fIn in fIns:
text = fIn.readline()
if text == "":
eof = True
break
texts.append(text)
if eof:
break
examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1))
if max_examples > 0 and len(examples) >= max_examples:
break
return examples