django_实现朴素/基本模糊拼写候选/纠错
这只是一个粗糙的玩具,不具备智能性
使用到的拼写数据库支持(一角)
数据库模型
from django.db.models.functions import Length
# Register the Length transform on CharField so that character columns can be
# filtered through ``__length`` lookups (e.g. ``spelling__length__gte``).
CharField.register_lookup(Length)
class WordMatcher(models.Model):
    """Row of the fuzzy-matching table: a spelling and its character-set string.

    NOTE: whenever the word dictionary is upgraded, this fuzzy-matching
    dictionary has to be upgraded along with it.
    """

    # The word's spelling (at most 255 characters).
    spelling = models.CharField(max_length=255)
    # Character-set string for the spelling (at most 26 characters).
    char_set = models.CharField(max_length=26)

    def __str__(self):
        """Return the string form of the ``[spelling, char_set]`` list."""
        pair = [self.spelling, self.char_set]
        return str(pair)
Words词典
char_set字段的计算(数据库的产生)
wob = Word.objects


class UpdateWordMatcher:
    """Populate the ``WordMatcher`` table from the ``Word`` dictionary."""

    def update(self, limit=2):
        """Create one ``WordMatcher`` row for each processed dictionary word.

        The stored ``char_set`` is the word's distinct characters, sorted and
        joined into a single string (e.g. ``"hello"`` -> ``"ehlo"``).

        :param limit: maximum number of ``Word`` rows to process. The default
            of 2 keeps the historical behaviour of this method; pass ``None``
            to process the whole dictionary.
        :return: None
        """
        words = wob.all()
        if limit is not None:
            words = words[:limit]

        for item in words:
            chars_str = "".join(sorted(set(item.spelling)))
            # Go through the model's manager directly: the short alias
            # ``wmob`` is not defined at this scope, so using it here would
            # raise NameError.
            WordMatcher.objects.create(
                spelling=item.spelling,
                char_set=chars_str,
            )
序列化器
class WordMatcherModelSerializer(ModelSerializer):
    """Model serializer for ``WordMatcher`` that exposes all of its fields."""

    class Meta:
        fields = "__all__"
        model = WordMatcher
参考代码
Serializer部分使用了 Django REST framework(DRF)的序列化器
class WordMatcherViewSet(ModelViewSet):
    """View set over the fuzzy-matching table (``WordMatcher``)."""

    wmob = WordMatcher.objects
    queryset = wmob.all()
    serializer_class = WordMatcherModelSerializer

    def fuzzy_match(self, req, spelling, start_with=0):
        """Return stored words that are plausible corrections of ``spelling``.

        Candidates are narrowed in three steps:

        1. database filter on spelling length and on a shared prefix;
        2. database filter on the size of the candidate's character set;
        3. in-Python comparison of the character sets themselves.

        :param req: the incoming request (not used by the matching logic).
        :param spelling: the (possibly misspelled) word to look up.
        :param start_with: number of leading characters a candidate must
            share with ``spelling``. ``0`` (the default) means "not forced":
            2 is used when ``spelling`` is longer than 4 characters,
            otherwise 1.
        :return: ``Res`` built from the serialized candidates, sorted by
            spelling.
        :rtype: Response
        """
        spelling_len = len(spelling)
        if start_with == 0:
            start_with = 2 if spelling_len > 4 else 1

        spelling_char_set = set(spelling)
        spelling_char_set_len = len(spelling_char_set)

        # Step 1: bound the candidate's length and require the common prefix.
        left_len = spelling_len * 0.70
        if spelling_len >= 4:
            right_len = spelling_len * 2
        else:
            right_len = spelling_len * 1.4
        # get_queryset() is what DRF recommends inside view methods instead
        # of touching the class-level ``queryset`` attribute directly.
        queryset = self.get_queryset().filter(
            spelling__length__gte=left_len,
            spelling__length__lte=right_len,
            spelling__startswith=spelling[:start_with],
        )

        # Step 2: bound the difference in character-set size.
        queryset = queryset.filter(
            char_set__length__lte=1.25 * spelling_char_set_len,
            char_set__length__gte=0.6 * spelling_char_set_len,
        )

        # Step 3 (final): compare the actual characters.
        # NOTE(review): the short-word branch is treated as the alternative
        # of the ``spelling_len >= 5`` check — confirm against the original
        # indentation.
        items = []
        for item in queryset:
            shared = set(item.char_set) & spelling_char_set
            if spelling_len >= 5:
                # Longer words: at least 80% of the characters must be shared,
                # measured against both the query and the candidate.
                shared_len = len(shared)
                if (shared_len >= spelling_char_set_len * 0.8
                        and shared_len >= len(item.char_set) * 0.8):
                    items.append(item)
            elif shared == spelling_char_set:
                # Short words: every query character must occur in the
                # candidate and the spellings must have equal length.
                if len(item.spelling) == spelling_len:
                    items.append(item)

        items.sort(key=lambda x: x.spelling)
        return Res(self.serializer_class(instance=items, many=True).data)

    def fuzzy_match_simple(self, req, spelling):
        """Run ``fuzzy_match`` with the default (automatic) prefix length."""
        return self.fuzzy_match(req, spelling)
路由
# GET fuzzy/<spelling>/ -> WordMatcherViewSet.fuzzy_match_simple
# (no prefix length in the URL).
path('fuzzy/<str:spelling>/', views.WordMatcherViewSet.as_view({
"get": "fuzzy_match_simple"
})),
# GET fuzzy/<spelling>/<start_with>/ -> WordMatcherViewSet.fuzzy_match,
# with the prefix length taken from the URL.
path('fuzzy/<str:spelling>/<int:start_with>/', views.WordMatcherViewSet.as_view({
"get": "fuzzy_match"
})),
api基本效果
eg0:
eg1:
eg2:
GET http://127.0.0.1:8000/word/fuzzy/fhather/1
项目地址
|