# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance._initialize_py import Editop, Editops
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the length of the longest common subsequence

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # the bit counting at the end does not work for an empty s1
    if not s1:
        return 0

    s1, s2 = conv_sequences(s1, s2)
    len1 = len(s1)

    # masks[ch] has bit i set for every index i with s1[i] == ch
    masks = {}
    for pos, symbol in enumerate(s1):
        masks[symbol] = masks.get(symbol, 0) | (1 << pos)

    # start with the lowest len(s1) bits set
    state = (1 << len1) - 1
    for symbol in s2:
        hits = state & masks.get(symbol, 0)
        state = (state + hits) | (state - hits)

    # equivalent of popcount(~state) in C, restricted to the lowest len(s1) bits
    lcs_len = bin(state)[-len1:].count("0")

    if score_cutoff is None or lcs_len >= score_cutoff:
        return lcs_len
    return 0
def _block_similarity(
block,
s1,
s2,
score_cutoff=None,
):
if not s1:
return 0
S = (1 << len(s1)) - 1
block_get = block.get
for ch2 in s2:
Matches = block_get(ch2, 0)
u = S & Matches
S = (S + u) | (S - u)
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
res = bin(S)[-len(s1) :].count("0")
return res if (score_cutoff is None or res >= score_cutoff) else 0
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the LCS distance in the range [0, max].

    This is calculated as ``max(len1, len2) - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the LCS distance between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.distance("lewenstein", "levenshtein")
    2

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> LCSseq.distance("lewenstein", "levenshtein", score_cutoff=1)
    2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    # the processor was already applied above, so it is not passed on
    longest = max(len(s1), len(s2))
    dist = longest - similarity(s1, s2)

    if score_cutoff is None or dist <= score_cutoff:
        return dist
    return score_cutoff + 1
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS distance in the range [0, 1].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0.
        1.0 is returned when ``is_none`` reports either input as missing,
        and 0.0 when both sequences are empty.
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))

    # Only two empty sequences are identical. When just one side is empty the
    # distance equals the length of the other side, which normalizes to 1.0.
    # This guard also protects the division below.
    if maximum == 0:
        return 0.0

    # the processor was already applied above, so it is not passed on
    norm_dist = distance(s1, s2) / maximum
    if score_cutoff is None or norm_dist <= score_cutoff:
        return norm_dist
    return 1.0
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0.
        0.0 is returned when ``is_none`` reports either input as missing.

    Examples
    --------
    Find the normalized LCS similarity between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein")
    0.8181818181818181

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8181818181818181
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # the processor was already applied above, so it is not passed on
    norm_sim = 1.0 - normalized_distance(s1, s2)
    if score_cutoff is None or norm_sim >= score_cutoff:
        return norm_sim
    # float, so the return type matches the documented result
    return 0.0
def _matrix(s1, s2):
if not s1:
return (0, [])
S = (1 << len(s1)) - 1
block = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1
matrix = []
for ch2 in s2:
Matches = block_get(ch2, 0)
u = S & Matches
S = (S + u) | (S - u)
matrix.append(S)
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
sim = bin(S)[-len(s1) :].count("0")
return (sim, matrix)
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq
    >>> for tag, src_pos, dest_pos in LCSseq.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    # only the part between the common prefix and suffix is aligned
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    sim, matrix = _matrix(s1, s2)

    result = Editops([], 0, 0)
    result._src_len = len(s1) + prefix_len + suffix_len
    result._dest_len = len(s2) + prefix_len + suffix_len

    # number of insertions + deletions that still have to be emitted
    remaining = len(s1) + len(s2) - 2 * sim
    if remaining == 0:
        return result

    # the list is filled from the back, since the matrix is walked backwards
    ops = [None] * remaining
    src_pos = len(s1)
    dest_pos = len(s2)

    while dest_pos != 0 and src_pos != 0:
        # deletion
        if matrix[dest_pos - 1] & (1 << (src_pos - 1)):
            remaining -= 1
            src_pos -= 1
            ops[remaining] = Editop("delete", src_pos + prefix_len, dest_pos + prefix_len)
            continue

        dest_pos -= 1

        # insertion
        if dest_pos and not (matrix[dest_pos - 1] & (1 << (src_pos - 1))):
            remaining -= 1
            ops[remaining] = Editop("insert", src_pos + prefix_len, dest_pos + prefix_len)
        # match
        else:
            src_pos -= 1

    # whatever is left of s1 has to be deleted
    while src_pos != 0:
        remaining -= 1
        src_pos -= 1
        ops[remaining] = Editop("delete", src_pos + prefix_len, dest_pos + prefix_len)

    # whatever is left of s2 has to be inserted
    while dest_pos != 0:
        remaining -= 1
        dest_pos -= 1
        ops[remaining] = Editop("insert", src_pos + prefix_len, dest_pos + prefix_len)

    result._editops = ops
    return result
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in LCSseq.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    # the alignment is computed as editops and converted afterwards
    ops = editops(s1, s2, processor=processor)
    return ops.as_opcodes()