代码所演示的算法就是参考文献1中的“Quick Search”算法,它实际是由BM算法改进而来,只使用了“坏字符启发”策略,其它时候按“Strict Forward”算法处理,实际运算的速度相当快。
这里对上述算法进行了一个很小的改动,加入了对中英文混合,也即单字节词与双字节词混合的支持。在使用原算法找出疑似匹配串后,再确定一下是否是一个词(可以是单字节词,也可以是双字节词)的起点,如果不是,则跳过匹配串,从下一个词开始重新匹配。
源代码是由纯汇编写的,如下:
;改进型的bm匹配算法 支持中英文混合(即单字节与双字节词混合;用nasm编译;函数原型;extern "C" bool sfstrstr(unsigned int td[256], const char* str1, const char* str2, char case_table[256]);;其中:; td内的所有元素需在最初被初始化为0; casetable是一个大小写转换用的转换表,加快大小写转换的速度,需用如下的方法生成; for(int i = 0; i < 256; ++i) ; casetable[i] = tolower(i);;;如果匹配成功返回true(1), 否则返回false(0);;xieyubo@gmail.com;;Last Modified: 2005.10.15 [BITS 32][global sfstrstr] countstrlen: xor eax, eax _countstrlen_loop: cmp [esi + eax], byte 0 je _countstrlen_exit inc eax jmp _countstrlen_loop _countstrlen_exit: ret lower: mov al, [esp + 4] and eax, 0xff add eax, [ebp + 20] mov al, [eax] mov [esp + 4], al ret getnumcnword: xor eax, eax xor ebx, ebx _getnumcnword_loop: cmp ebx, [ebp - 8] je _getnumcnword_exit mov edx, [ebp + 12] add edx, ebx test [edx], byte 0x80 jz _getnumcnword_loop_goto inc eax _getnumcnword_loop_goto: inc ebx jmp _getnumcnword_loop _getnumcnword_exit: ret inittd: mov esi, [ebp + 16] mov edi, [ebp + 8] xor ecx, ecx _inittd_loop: mov eax, [ebp - 12] cmp ecx, eax jnl _inittd_exit mov edx, eax sub edx, ecx push dword [esi + ecx] call lower pop eax and eax, 0xff mov [edi + eax * 4], edx inc ecx jmp _inittd_loop _inittd_exit: ret resettd: mov esi, [ebp + 16] mov edi, [ebp + 8] xor ecx, ecx _resettd_loop: cmp ecx, [ebp - 12] jnl _resettd_exit push dword [esi + ecx] call lower pop eax and eax, 0xff mov [edi + eax * 4], dword 0 inc ecx jmp _resettd_loop _resettd_exit: ret sfstrstr: push ebp mov ebp, esp sub esp, 20 push esi push edi ;;strlen(str1) mov esi, [ebp + 12] call countstrlen mov [ebp - 16], eax ;;strlen(str2) mov esi, [ebp + 16] call countstrlen mov [ebp - 12], eax call inittd mov esi, [ebp + 16] mov edi, [ebp + 12] mov [ebp - 4], dword 0 mov [ebp - 8], dword 0 _sfstrstr_loop2: xor ecx, ecx mov eax, [ebp - 8] add eax, [ebp - 12] cmp eax, [ebp - 16] jg _sfstrstr_loop2_exit _sfstrstr_loop3: cmp ecx, [ebp - 12] jnl _sfstrstr_loop3_exit mov eax, [ebp - 8] add eax, ecx push dword [edi + eax] call lower pop ebx push dword [esi + ecx] call lower pop edx cmp bl, dl jne _sfstrstr_loop3_exit inc ecx jmp _sfstrstr_loop3 _sfstrstr_loop3_exit: cmp ecx, [ebp - 12] jne _sfstrstr_goto1 call getnumcnword test eax, 0x00000001 jz _sfstrstr_goto3 mov eax, [ebp - 12] inc eax add [ebp - 8], eax jmp _sfstrstr_loop2 _sfstrstr_goto3: mov [ebp - 4], dword 1 jmp _sfstrstr_loop2_exit _sfstrstr_goto1: mov eax, [ebp - 8] add eax, [ebp - 12] push dword [edi + eax] call lower pop eax and eax, 0xff shl eax, 2 add eax, [ebp + 8] mov eax, [eax] cmp eax, 0 je _sfstrstr_goto2 add [ebp - 8], eax jmp _sfstrstr_loop2 _sfstrstr_goto2: mov eax, [ebp - 12] inc eax add [ebp - 8], eax jmp _sfstrstr_loop2 _sfstrstr_loop2_exit: call resettd mov eax, [ebp - 4] return: pop edi pop esi mov esp, ebp pop ebp ret
联系客服