Rabin-Karp Algorithm

Generated on 2025-07-10 02:08:07 Algorithm Cheatsheet for Technical Interviews

Rabin-Karp Algorithm Cheatsheet

1. Quick Overview

What it is: A string searching algorithm that uses hashing to find occurrences of a pattern string within a larger text. It efficiently reduces the number of character comparisons by comparing hash values of substrings instead of the substrings themselves.

When to use it:

Searching for multiple patterns in a text.
When the pattern size is relatively small compared to the text size.
When a fast average-case performance is desired, even if worst-case is possible.
Suitable for rolling hash implementations where the hash of the next substring can be computed from the current one in O(1) time.

2. Time & Space Complexity

Operation	Time Complexity	Space Complexity
Preprocessing	O(m)	O(1)
Searching	O(n) (Avg/Best)	O(1)
Searching	O(nm) (Worst)	O(1)
Overall (Avg)	O(n + m)	O(1)
Overall (Worst)	O(nm)	O(1)

n: Length of the text string
m: Length of the pattern string

Worst-Case Scenario: Occurs when the hash function produces many collisions (different substrings having the same hash value). This can lead to O(nm) time complexity because we have to compare the strings character by character for each potential match.

3. Implementation

Here are examples in Python, Java, and C++:

Python:

def rabin_karp(text, pattern, prime=101):
    """
    Finds all occurrences of pattern in text using Rabin-Karp algorithm.

    Args:
        text: The text string to search within.
        pattern: The pattern string to search for.
        prime: A prime number used for hashing (helps reduce collisions).

    Returns:
        A list of starting indices where the pattern is found in the text.
    """
    n = len(text)
    m = len(pattern)
    if m > n:
        return []

    pattern_hash = 0
    text_hash = 0
    h = 1  # h = pow(prime, m-1) % prime

    for i in range(m - 1):
        h = (h * prime)  # Calculate h = prime^(m-1)

    # Calculate initial hash values for pattern and first substring of text
    for i in range(m):
        pattern_hash = (prime * pattern_hash + ord(pattern[i]))
        text_hash = (prime * text_hash + ord(text[i]))

    indices = []

    # Slide the pattern over text one by one
    for i in range(n - m + 1):
        # Check the hash values of current window of text and pattern
        if pattern_hash == text_hash:
            # If the hash values match, then check for actual match
            if pattern == text[i:i + m]:
                indices.append(i)

        # Calculate hash value for next window of text: Remove leading digit, add trailing digit
        if i < n - m:
            text_hash = (prime * (text_hash - ord(text[i]) * h) + ord(text[i + m]))

    return indices

# Example Usage
text = "ABABDABACDABABCABAB"
pattern = "ABABCABAB"
result = rabin_karp(text, pattern)
print(f"Pattern found at indices: {result}") # Output: Pattern found at indices: [10]

Java:

public class RabinKarp {

    public static void rabinKarp(String text, String pattern, int prime) {
        int n = text.length();
        int m = pattern.length();
        if (m > n) {
            return;
        }

        long patternHash = 0;
        long textHash = 0;
        long h = 1;

        // Calculate h = prime^(m-1)
        for (int i = 0; i < m - 1; i++) {
            h = (h * prime);
        }

        // Calculate initial hash values for pattern and first substring of text
        for (int i = 0; i < m; i++) {
            patternHash = (prime * patternHash + pattern.charAt(i));
            textHash = (prime * textHash + text.charAt(i));
        }

        // Slide the pattern over text one by one
        for (int i = 0; i <= n - m; i++) {
            // Check the hash values of current window of text and pattern
            if (patternHash == textHash) {
                // If the hash values match, then check for actual match
                if (pattern.equals(text.substring(i, i + m))) {
                    System.out.println("Pattern found at index " + i);
                }
            }

            // Calculate hash value for next window of text: Remove leading digit, add trailing digit
            if (i < n - m) {
                textHash = (prime * (textHash - text.charAt(i) * h) + text.charAt(i + m));
            }
        }
    }

    public static void main(String[] args) {
        String text = "ABABDABACDABABCABAB";
        String pattern = "ABABCABAB";
        int prime = 101;
        rabinKarp(text, pattern, prime); // Output: Pattern found at index 10
    }
}

C++:

#include <iostream>
#include <string>
#include <vector>

using namespace std;

vector<int> rabinKarp(const string& text, const string& pattern, int prime = 101) {
    int n = text.length();
    int m = pattern.length();
    if (m > n) {
        return {};
    }

    long long patternHash = 0;
    long long textHash = 0;
    long long h = 1;

    // Calculate h = prime^(m-1)
    for (int i = 0; i < m - 1; i++) {
        h = (h * prime);
    }

    // Calculate initial hash values for pattern and first substring of text
    for (int i = 0; i < m; i++) {
        patternHash = (prime * patternHash + pattern[i]);
        textHash = (prime * textHash + text[i]);
    }

    vector<int> indices;

    // Slide the pattern over text one by one
    for (int i = 0; i <= n - m; i++) {
        // Check the hash values of current window of text and pattern
        if (patternHash == textHash) {
            // If the hash values match, then check for actual match
            if (pattern == text.substr(i, m)) {
                indices.push_back(i);
            }
        }

        // Calculate hash value for next window of text: Remove leading digit, add trailing digit
        if (i < n - m) {
            textHash = (prime * (textHash - text[i] * h) + text[i + m]);
        }
    }

    return indices;
}

int main() {
    string text = "ABABDABACDABABCABAB";
    string pattern = "ABABCABAB";
    vector<int> result = rabinKarp(text, pattern);

    cout << "Pattern found at indices: ";
    for (int index : result) {
        cout << index << " ";
    }
    cout << endl; // Output: Pattern found at indices: 10
    return 0;
}

4. Common Patterns

Multiple Pattern Search: Extend the algorithm to search for multiple patterns simultaneously by storing the hash values of all patterns in a set or hash table.
Approximate String Matching: Modify the algorithm to allow for a certain number of mismatches between the pattern and the text. This can be done by comparing the hash values and then performing a more detailed comparison only for substrings with similar hash values.
Rolling Hash: The core technique used in Rabin-Karp. It allows for efficient calculation of the hash value of the next substring by using the hash value of the current substring. This is critical for achieving O(n) average-case time complexity.

5. Pro Tips

Prime Number Selection: Choose a large prime number to minimize collisions. Larger primes reduce the likelihood of different strings hashing to the same value.
Modulo Arithmetic: Use modulo operator (%) to prevent integer overflow, especially when dealing with large prime numbers and long strings.
Spurious Hits: Handle spurious hits (hash match but string mismatch) efficiently. Always perform a character-by-character comparison when hash values match.
Hash Function Choice: The quality of the hash function significantly impacts performance. A good hash function distributes strings evenly across the hash space, minimizing collisions. The polynomial hash function (used in the examples) is a common choice.
Precomputation: Precompute h = prime^(m-1) outside the main loop for efficiency.

6. Practice Problems

LeetCode 28. Find the Index of the First Occurrence in a String: Implement strStr() using Rabin-Karp.
LeetCode 187. Repeated DNA Sequences: Find all 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. (Good for practicing rolling hash)
Custom Problem: Given a text and a list of patterns, find all occurrences of any of the patterns in the text.

7. Templates

C++ Template:

#include <iostream>
#include <string>
#include <vector>

using namespace std;

vector<int> rabinKarpTemplate(const string& text, const string& pattern, int prime = 101) {
    int n = text.length();
    int m = pattern.length();
    if (m > n) {
        return {};
    }

    long long patternHash = 0;
    long long textHash = 0;
    long long h = 1;

    // Calculate h = prime^(m-1)
    for (int i = 0; i < m - 1; i++) {
        h = (h * prime);
    }

    // Calculate initial hash values for pattern and first substring of text
    for (int i = 0; i < m; i++) {
        patternHash = (prime * patternHash + pattern[i]);
        textHash = (prime * textHash + text[i]);
    }

    vector<int> indices;

    // Slide the pattern over text one by one
    for (int i = 0; i <= n - m; i++) {
        // Check the hash values of current window of text and pattern
        if (patternHash == textHash) {
            // If the hash values match, then check for actual match
            if (pattern == text.substr(i, m)) {
                indices.push_back(i);
            }
        }

        // Calculate hash value for next window of text: Remove leading digit, add trailing digit
        if (i < n - m) {
            textHash = (prime * (textHash - text[i] * h) + text[i + m]);
        }
    }

    return indices;
}

Python Template:

def rabin_karp_template(text, pattern, prime=101):
    n = len(text)
    m = len(pattern)
    if m > n:
        return []

    pattern_hash = 0
    text_hash = 0
    h = 1

    for i in range(m - 1):
        h = (h * prime)

    for i in range(m):
        pattern_hash = (prime * pattern_hash + ord(pattern[i]))
        text_hash = (prime * text_hash + ord(text[i]))

    indices = []

    for i in range(n - m + 1):
        if pattern_hash == text_hash:
            if pattern == text[i:i + m]:
                indices.append(i)

        if i < n - m:
            text_hash = (prime * (text_hash - ord(text[i]) * h) + ord(text[i + m]))

    return indices

Java Template:

public class RabinKarpTemplate {

    public static void rabinKarpTemplate(String text, String pattern, int prime) {
        int n = text.length();
        int m = pattern.length();
        if (m > n) {
            return;
        }

        long patternHash = 0;
        long textHash = 0;
        long h = 1;

        for (int i = 0; i < m - 1; i++) {
            h = (h * prime);
        }

        for (int i = 0; i < m; i++) {
            patternHash = (prime * patternHash + pattern.charAt(i));
            textHash = (prime * textHash + text.charAt(i));
        }

        for (int i = 0; i <= n - m; i++) {
            if (patternHash == textHash) {
                if (pattern.equals(text.substring(i, i + m))) {
                    System.out.println("Pattern found at index " + i);
                }
            }

            if (i < n - m) {
                textHash = (prime * (textHash - text.charAt(i) * h) + text.charAt(i + m));
            }
        }
    }
}