Skip to content

Inference

create_noisy_columns

create_noisy_columns(data, min_noise, max_noise)

Generate columns for keyless reading from plain data with defined noise range.

Parameters:

Name Type Description Default
data str

Plain text.

required
min_noise int

Minimum number of noise letters added to each column (inclusive).

required
max_noise int

Upper bound on the number of noise letters added to each column (exclusive).

required

Returns:

Name Type Description
columns List[str]

Columns for keyless reading.

Source code in src/trecover/utils/inference.py
def create_noisy_columns(data: str, min_noise: int, max_noise: int) -> List[str]:
    """
    Generate columns for keyless reading from plain data with defined noise range.

    Every letter of the cleaned text becomes one column: the letter itself
    followed by randomly chosen, distinct noise letters taken from
    ``var.ALPHABET`` without that letter.

    Parameters
    ----------
    data : str
        Plain text. Characters outside ``A-Za-z`` are removed and the rest is lowercased.
    min_noise : int
        Minimum number of noise letters per column (inclusive).
    max_noise : int
        Upper bound on the number of noise letters per column (exclusive).

    Returns
    -------
    columns : List[str]
        Columns for keyless reading. Empty if `data` contains no letters.

    Raises
    ------
    ValueError
        Raised by NumPy when `data` contains letters and the noise range is empty
        (``min_noise >= max_noise``) or the drawn noise size cannot be sampled
        without replacement from the remaining alphabet.

    """

    # A private generator seeded from fresh OS entropy keeps the output different on
    # every call (as the former ``np.random.seed(None)`` did) without clobbering the
    # global ``np.random`` state that the caller may have seeded for reproducibility.
    rng = np.random.default_rng()
    columns = list()

    data = re.sub(r'[^A-Za-z]', '', data).lower()

    for symbol in data:
        # ``high`` is exclusive, so the noise size lies in [min_noise, max_noise).
        noise_size = rng.integers(low=min_noise, high=max_noise)
        # The true letter is excluded from the candidates so it never appears twice in a column.
        noise_letters = rng.choice(list(var.ALPHABET.difference(symbol)), size=noise_size, replace=False)
        columns.append(f"{symbol}{''.join(noise_letters)}")

    return columns

create_files_noisy_columns

create_files_noisy_columns(
    files, min_noise, max_noise, n_to_show=0
)

Generate columns for keyless reading from plain data contained in the files with defined noise range.

Parameters:

Name Type Description Default
files List[Union[str, Path]]

Paths to files that contain plain data to create noised columns for keyless reading.

required
min_noise int

Minimum noise range value.

required
max_noise int

Maximum noise range value.

required
n_to_show int, default

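Maximum number of leading characters read from each file; the limit is applied before non-letter characters are stripped, so it is an upper bound on the number of columns. Zero means no restrictions.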

0

Returns:

Name Type Description
files_columns List[List[str]]

Batch of columns for keyless reading.

Source code in src/trecover/utils/inference.py
def create_files_noisy_columns(files: List[Union[str, Path]],
                               min_noise: int,
                               max_noise: int,
                               n_to_show: int = 0
                               ) -> List[List[str]]:
    """
    Build noisy keyless-reading columns for the plain text stored in each file.

    Parameters
    ----------
    files : List[Union[str, Path]]
        Paths to the files holding the plain text to be turned into noisy columns.
    min_noise : int
        Minimum noise range value, forwarded to `create_noisy_columns`.
    max_noise : int
        Maximum noise range value, forwarded to `create_noisy_columns`.
    n_to_show : int, default=0
        Number of leading characters of each file that are kept before the columns
        are built, which caps the number of columns. Non-positive values mean no restriction.

    Returns
    -------
    files_columns : List[List[str]]
        One list of columns per input file, in the order of `files`.

    """

    def _columns_for(path: Union[str, Path]) -> List[str]:
        # Read the whole file, then optionally keep only its leading characters.
        with open(path) as source:
            text = source.read()

        plain = text[:n_to_show] if n_to_show > 0 else text

        return create_noisy_columns(plain, min_noise, max_noise)

    return [_columns_for(path) for path in files]

data_to_columns

data_to_columns(data, separator=' ')

Clean and split noised data.

Parameters:

Name Type Description Default
data str

Noised columns for keyless reading.

required
separator str, default

Separator to split the data into columns.

' '

Returns:

Type Description
List[str]

Columns for keyless reading.

Source code in src/trecover/utils/inference.py
def data_to_columns(data: str, separator: str = ' ') -> List[str]:
    """
    Clean and split noised data.

    Every occurrence of `separator` is turned into a space, all characters other
    than ASCII letters and spaces are removed, the text is lowercased and then
    split into columns.

    Parameters
    ----------
    data : str
        Noised columns for keyless reading.
    separator : str, default=' '
        Separator to split the data into columns. It is matched literally, so
        characters such as ``.``, ``|`` or ``*`` carry no special meaning.

    Returns
    -------
    List[str]:
        Columns for keyless reading. Empty columns (caused by empty input or by
        repeated, leading or trailing separators) are omitted.

    """

    # Literal replacement: the separator is plain text, not a regular expression.
    data = data.replace(separator, ' ')
    cleaned_data = re.sub(r'[^A-Za-z ]', '', data).lower()

    # Only letters and spaces remain at this point, so whitespace splitting is
    # equivalent to splitting on ' ' while discarding empty columns.
    return cleaned_data.split()

read_files_columns

read_files_columns(files, separator, n_to_show=0)

Read, clean and split noised data contained in the files.

Parameters:

Name Type Description Default
files List[Union[str, Path]]

Paths to files that contain noised data for keyless reading.

required
separator str

Separator to split the data into columns.

required
n_to_show int, default

Maximum number of columns. Zero means no restrictions.

0

Returns:

Name Type Description
files_columns List[List[str]]

Batch of columns for keyless reading.

Source code in src/trecover/utils/inference.py
def read_files_columns(files: List[Union[str, Path]], separator: str, n_to_show: int = 0) -> List[List[str]]:
    """
    Load noised data from each file and convert it to cleaned columns.

    Parameters
    ----------
    files : List[Union[str, Path]]
        Paths to the files holding noised data for keyless reading.
    separator : str
        Separator used to split each file's data into columns; forwarded to `data_to_columns`.
    n_to_show : int, default=0
        Maximum number of columns kept per file. Non-positive values mean no restriction.

    Returns
    -------
    files_columns : List[List[str]]
        One list of columns per input file, in the order of `files`.

    """

    batch = []

    for path in files:
        with open(path) as source:
            columns = data_to_columns(source.read(), separator)

        # Truncation happens after splitting, so the limit counts columns, not characters.
        batch.append(columns[:n_to_show] if n_to_show > 0 else columns)

    return batch