Introduction to String Similarity
String similarity is a critical concept in computer science and software development. It involves measuring how closely related or similar two strings (sequences of characters) are.
String Similarity APIs
In this section, we will explore several useful APIs that can help you compute string similarities effectively.
1. Levenshtein Distance
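The Levenshtein distance (also called edit distance) is a popular metric for measuring the difference between two sequences: it is the minimum number of single-character insertions, deletions, or substitutions needed to turn one string into the other.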
import Levenshtein
def levenshtein_distance(str1, str2):
    """Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    This is a thin wrapper that delegates the computation to
    ``Levenshtein.distance`` and returns its result unchanged.
    """
    edit_count = Levenshtein.distance(str1, str2)
    return edit_count
# Example usage: turning "kitten" into "sitting" takes three single-character
# edits (two substitutions and one insertion), so this is expected to print 3.
result = levenshtein_distance("kitten", "sitting")
print(f"Levenshtein Distance: {result}")
2. Jaccard Similarity
The Jaccard similarity coefficient measures the similarity between two sets by dividing the size of their intersection by the size of their union.
from sklearn.feature_extraction.text import CountVectorizer
# jaccard_score is exported by sklearn.metrics; it is not part of
# sklearn.metrics.pairwise, so importing it from there raises ImportError.
from sklearn.metrics import jaccard_score


def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are tokenized together by one ``CountVectorizer`` so they
    share a vocabulary. Each string becomes a 0/1 vector marking which
    vocabulary words it contains, and the two vectors are compared with
    ``jaccard_score``.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The Jaccard coefficient as returned by ``jaccard_score``:
        shared words divided by distinct words across both strings.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when neither string
            yields any token (empty vocabulary).
    """
    # binary=True records word presence instead of word counts. Without it a
    # repeated word produces a count > 1, the vectors stop being binary label
    # vectors, and jaccard_score(average='binary') rejects them.
    presence = CountVectorizer(binary=True).fit_transform([str1, str2]).toarray()
    return jaccard_score(presence[0], presence[1], average='binary')
# Example usage: CountVectorizer's default tokenizer skips one-character
# tokens such as "I", so the strings share 1 of 3 distinct words ("love" out
# of "love", "coding", "programming"); the expected result is about 0.33.
result = jaccard_similarity("I love coding", "I love programming")
print(f"Jaccard Similarity: {result}")
3. Cosine Similarity
Cosine similarity measures the cosine of the angle between two non-zero vectors of an inner product space.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def cosine_sim(str1, str2):
    """Return the cosine similarity of the TF-IDF vectors of two strings.

    Both strings are vectorized together by a fresh ``TfidfVectorizer`` so
    they share one vocabulary. The result is the single entry of the 1x1
    matrix produced by ``cosine_similarity``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([str1, str2])
    # Unpack the two dense rows: one TF-IDF vector per input string.
    first_row, second_row = tfidf_matrix.toarray()
    pairwise = cosine_similarity([first_row], [second_row])
    return pairwise[0][0]
# Example usage: the two strings share only the word "learning", so the
# score is expected to fall strictly between 0 (no overlap) and 1 (identical).
result = cosine_sim("machine learning", "deep learning")
print(f"Cosine Similarity: {result}")
4. Hamming Distance
The Hamming distance is the number of positions at which two equal-length strings differ; it is used for error detection and error correction in coding theory.
def hamming_distance(str1, str2):
    """Count the positions at which two equal-length strings differ.

    Raises:
        ValueError: If ``str1`` and ``str2`` have different lengths.
    """
    if len(str1) == len(str2):
        mismatches = 0
        # Walk both strings in lockstep and tally each differing position.
        for left, right in zip(str1, str2):
            mismatches += left != right
        return mismatches
    raise ValueError("Strings must be of the same length")
# Example usage: "101010" and "100100" differ at three positions (the third,
# fourth and fifth characters), so this is expected to print 3.
result = hamming_distance("101010", "100100")
print(f"Hamming Distance: {result}")
App Example Combining All Introduced APIs
Here we will create a simple app that uses all of the above APIs to compare user input strings.
from flask import Flask, request, jsonify
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein
# Flask application object; the /compare route below is registered on it.
app = Flask(__name__)
@app.route('/compare', methods=['POST'])
def compare_strings():
    """Compare two strings from a JSON POST body with every metric.

    Expects a JSON object with string fields ``str1`` and ``str2``.

    Returns:
        A JSON response with the keys ``levenshtein_distance``,
        ``jaccard_similarity``, ``cosine_similarity`` and
        ``hamming_distance``. The Hamming entry is the string
        ``'N/A (unequal length)'`` when the inputs differ in length.
        A malformed request gets a JSON ``error`` message with HTTP
        status 400 instead.
    """
    # silent=True yields None for a missing or unparsable JSON body, so bad
    # input is reported below as a 400 rather than surfacing as a server error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    str1 = data.get('str1')
    str2 = data.get('str2')
    if not isinstance(str1, str) or not isinstance(str2, str):
        return jsonify(
            {'error': "Fields 'str1' and 'str2' are required and must be strings"}
        ), 400

    results = {
        'levenshtein_distance': Levenshtein.distance(str1, str2),
        'jaccard_similarity': jaccard_similarity(str1, str2),
        'cosine_similarity': cosine_sim(str1, str2),
        # Hamming distance is only defined for equal-length strings.
        'hamming_distance': hamming_distance(str1, str2) if len(str1) == len(str2) else 'N/A (unequal length)'
    }
    return jsonify(results)
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are tokenized together by one ``CountVectorizer`` so they
    share a vocabulary. Each string becomes a 0/1 vector marking which
    vocabulary words it contains, and the two vectors are compared with
    ``jaccard_score``.

    Args:
        str1: First text to compare.
        str2: Second text to compare.

    Returns:
        The Jaccard coefficient as returned by ``jaccard_score``:
        shared words divided by distinct words across both strings.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when neither string
            yields any token (empty vocabulary).
    """
    # binary=True records word presence instead of word counts. Without it a
    # repeated word produces a count > 1, the vectors stop being binary label
    # vectors, and jaccard_score(average='binary') rejects them.
    presence = CountVectorizer(binary=True).fit_transform([str1, str2]).toarray()
    return jaccard_score(presence[0], presence[1], average='binary')
def cosine_sim(str1, str2):
    """Return the cosine similarity of the TF-IDF vectors of two strings.

    Both strings are vectorized together by a fresh ``TfidfVectorizer`` so
    they share one vocabulary. The result is the single entry of the 1x1
    matrix produced by ``cosine_similarity``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([str1, str2])
    # Unpack the two dense rows: one TF-IDF vector per input string.
    first_row, second_row = tfidf_matrix.toarray()
    pairwise = cosine_similarity([first_row], [second_row])
    return pairwise[0][0]
def hamming_distance(str1, str2):
    """Count the positions at which two equal-length strings differ.

    Raises:
        ValueError: If ``str1`` and ``str2`` have different lengths.
    """
    if len(str1) == len(str2):
        mismatches = 0
        # Walk both strings in lockstep and tally each differing position.
        for left, right in zip(str1, str2):
            mismatches += left != right
        return mismatches
    raise ValueError("Strings must be of the same length")
# Start Flask's built-in server only when this file is executed as a script,
# not when it is imported.
# NOTE: debug=True turns on Flask's debug mode (auto-reload and the
# interactive debugger); keep it for local development only.
if __name__ == '__main__':
    app.run(debug=True)
With this app, you can compare strings using multiple similarity metrics seamlessly.
Hash: f3d1731459f5b665c4702aacfe62c26df6f95b964891446a0b7151cbf4ac8559