字串比對

December 1, 2021

高階程式語言對於字串比對都有直接支援,不過仍是個值得探討的課題,傳統的字串比對是從關鍵字與字串的開頭開始比對,例如 Knuth-Morris-Pratt 演算法字串比對,這個方法也不錯,不過要花時間在公式計算上,這邊說明 Boyer-Moore 方法。

解法思路

Boyer-Moore 字串比對由關鍵字的後面開始核對字串,並製作前進表,如果比對不符合,依前進表中的值前進至下個核對處,假設是 p 好了,然後比對字串中 p - n + 1 至 p 的值是否與關鍵字相同。

字串比對

舉個實際的例子來說明前進表的製作,若要在字串比對 “JUST” 字串,可能遇到的幾個情況如下:

字串比對

依照這個例子,可以決定出前進值表如下:

其他 J U S T
4 3 2 1 4(match?)

如果關鍵字中有重複的字元,前進值就會有兩個以上的值,此時則取前進值較小的值,如此就不會跳過可能的位置,例如 “texture” 這個關鍵字,’t' 的前進值應該取後面的 3 而不是取前面的 7。

程式實作

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#define SKIP_TABLE_SIZE 256
#define STRING_LENGTH 80

void table(int*, char*);  // 建立前進表 
int indexOf(int*, int, char*, char*); // 搜尋關鍵字 
void subString(char*, char*, int, int); // 取出子字串 

int main(void) { 
    int skip[SKIP_TABLE_SIZE];
    
    char input[STRING_LENGTH]; 
    char key[STRING_LENGTH]; 
    char sub[STRING_LENGTH] = {'\0'}; 
    
    printf("字串:"); 
    gets(input); 
    printf("關鍵字:"); 
    gets(key); 

    table(skip, key);
    
    int m = strlen(input); // 計算字串長度 
    int n = strlen(key); 
    int p = indexOf(skip, n - 1, input, key); 

    while(p != -1) { 
        subString(input, sub, p, m); 
        printf("%s\n", sub); 
        p = indexOf(skip, p + n + 1, input, key); 
    } 
    
    return 0; 
} 

void table(int* skip, char *key) { 
    int n = strlen(key); 
    int k;
    for(k = 0; k < SKIP_TABLE_SIZE; k++) {
        skip[k] = n; 
    }
    for(k = 0; k < n - 1; k++) {
        skip[key[k]] = n - k - 1; 
    }
} 

int indexOf(int* skip, int from, char* str, char* key) { 
    char sub[STRING_LENGTH] = {'\0'}; 

    int strLen = strlen(str); 
    int keyLen = strlen(key); 
    int index = from;
    
    while(index < strLen) { 
        subString(str, sub, index - keyLen + 1, index); 
        if(!strcmp(sub, key)) { // 比較兩字串是否相同 
           return index - keyLen + 1; 
        }
        index += skip[str[index]]; 
    } 

    return -1; 
} 

void subString(char *text, char* sub, int s, int e) { 
    int i, j; 
    for(i = s, j = 0; i <= e; i++, j++) {
        sub[j] = text[i]; 
    }
    sub[j] = '\0'; 
}  
import java.util.*;
import java.io.*;
import static java.lang.System.*;

public class StringMatcher implements Iterable<String> {
    private String str;
    private String key;
    private int[] skip = new int[256];

    public StringMatcher(String str, String key) {
        this.str = str;        
        this.key = key;
        Arrays.fill(skip, key.length());
        for(int k = 0; k < key.length() - 1; k++) {
            skip[key.charAt(k)] = key.length() - k - 1;
        }
    }
    
    public Iterator<String> iterator() { return new Itr(); }

    private class Itr implements Iterator<String> {
        private int index;
        
        { index = indexOf(key.length() - 1, str, key); }
        
        private int indexOf(int from, String str, String key) {
            int tp = from;
            while(tp < str.length() && 
                  ! str.substring(tp - key.length() + 1, tp + 1)
                       .equals(key)) {
                tp += skip[str.charAt(tp)];
            }
            return tp - key.length() + 1;
        }

        public boolean hasNext() {
            return index < str.length() - 1;
        }

        public String next() {
            String sub = str.substring(index);
            index = indexOf(index + key.length() + 1, str, key);
            return sub;
        }

        public void remove() { throw new RuntimeException("Not supported"); }
    }
    
    public static void main(String[] args) {
        Scanner scanner = new Scanner(in);

        out.print("請輸入字串:");
        String str = scanner.nextLine();
        out.print("請輸入搜尋關鍵字:");
        String key = scanner.nextLine();

        for(String s : new StringMatcher(str, key)) {
            out.println(s);
        }
    }
}
def matcher(str, key):
    strLen = len(str)
    keyLen = len(key)
    
    skip = [keyLen - key.rindex(chr(k)) - 1 
            if(chr(k) in key[:-1]) else keyLen for k in range(256)]
            
    def next(i):
        return (next(i + skip[ord(str[i])]) 
            if(i < strLen and str[i - keyLen + 1: i + 1] != key) 
            else i - keyLen + 1)
    
    def match(i):
        nextI = next(i + keyLen + 1)
        return [str[i:]] + (match(nextI) if(nextI < strLen - 1) else [])
  
    return match(next(keyLen))
    
for s in matcher(input("字串:"), input("關鍵字:")):
    print(s)
def matcher(str: String, key: String) = {
    val skip = for(k <- 0 until 256) yield 
          if(key.init.contains(k.toChar)) 
              key.length - key.lastIndexOf(k.toChar) - 1 
          else key.length

    def next(i: Int): Int = {
        if(i < str.length && 
           !str.substring(i - key.length + 1, i + 1).equals(key))
            next(i + skip(str.charAt(i).toInt))
        else i - key.length + 1
    }
    
    def find(i: Int): List[String] = {
        val nextI = next(i + key.length + 1)
        str.substring(i) :: (if(nextI < str.length - 1) find(nextI) else Nil)
    }
    
    find(next(key.length))
}

print("字串:")
val str = readLine
print("關鍵字:")
val key = readLine
matcher(str, key).foreach(println)
# encoding: UTF-8
def matcher(str, key)
    skip = (0...256).map { |k|
        if key[0...-1].include? k.chr
            key.size - key.rindex(k.chr) - 1
        else
            key.size
        end
    }
    
    nextIndex = ->(i) {
        if i < str.size and str[i - key.size + 1 ... i + 1] != key
            nextIndex.call(i + skip[str[i].ord])
        else
            i - key.size + 1
        end
    }
    
    match = ->(i) {
        nextI = nextIndex.call(i + key.size + 1)
        [str[i..-1]] + if nextI < str.size - 1; match.call(nextI) else [] end
    }
              
    return match.call(nextIndex.call(key.size))
end

print "字串:"
str = gets.chomp
print "關鍵字:"
key = gets.chomp

matcher(str, key).each do |s|
    puts s
end
var matcher = function() {
    function range(n) {
        var r = [];
        for(var i = 0; i < n; i++) { r[i] = i; }
        return r;
    }

    return function(str, key) {
        var skip = range(256).map(function(k) {
             return key.slice(0, -1)
                       .indexOf(String.fromCharCode(k)) !== -1 ? 
                    key.length - key.lastIndexOf(String.fromCharCode(k)) - 1 :
                    key.length;
        });
        
        function next(i) {
            return i < str.length && 
                   str.slice(i - key.length + 1, i + 1) !== key ?
                   next(i + skip[str.charCodeAt(i)]) :
                   i - key.length + 1
        }
    
        function match(i) {
            var nextI = next(i + key.length + 1);
            return [str.slice(i)].concat(
                     nextI < str.length - 1 ? match(nextI) : []);
        }
        
        return match(next(key.length))
    };
}();

matcher('This is a test', 'is').forEach(function(s) {
    print(s);
});
import Data.Char
import Data.List

rindex elem list = 
    let (Just a) = elem `elemIndex` (reverse list)
    in length list - a - 1

matcher str key =  find $ next keyLen
    where strLen = length str
          keyLen = length key
          skip = [if (chr k) `elem` (init key) then 
                      keyLen - rindex (chr k) key - 1 
                  else keyLen | k <- [0..255]]
          next i = if i < strLen && 
                       (drop (i - keyLen + 1) . take (i + 1) $ str) /= key 
                       then next (i + skip !! (ord (str !! i))) 
                   else i - keyLen + 1
          find i = let nextI = next $ i + length key + 1
                   in (drop i str) : if nextI < strLen - 1 then find nextI 
                                     else []

main = do
    putStrLn "String..."
    str <- getLine
    putStrLn "Keyword..."
    key <- getLine
    sequence [putStrLn s | s <- matcher str key]
# 我寫的玩具語言 https://github.com/JustinSDK/toy_lang

SKIP_TABLE_SIZE = 256

def table(skip, key) { 
    n = key.length()
    iterate(0, SKIP_TABLE_SIZE).forEach(k -> skip.set(k, n))
    iterate(0, n - 1).forEach(k -> skip.set(key.charCodeAt(k), n - k - 1))
} 

def indexOf(skip, from, text, key) { 
    index = from
    while index < text.length() {
        if text.substring(index - key.length() + 1, index + 1) == key {
            return index - key.length() + 1
        }
        index += skip.get(text.charCodeAt(index))
    }
    return -1 
} 

text = input('字串:')
key = input('關鍵字:')

skip = range(0, SKIP_TABLE_SIZE)
table(skip, key)

p = indexOf(skip, key.length() - 1, text, key)
while p != -1 {
    println(text.substring(p, text.length()))
    p = indexOf(skip, p + key.length() + 1, text, key)
}