Google Code JamThis years' CodeJam qualification round started with a problem called Alien Language which is essentially a pattern matching exercise. The alien language has D words, each of length L, with all of these known in advance. You are given a bunch of regex-like word patterns and you need to report on the number of matches that each word pattern can produce.

The approach I took was to build a list of indices of the possible words that match the given pattern and prune this list as we progress through the pattern from left to right. For each token in the pattern we only need to look for matches in the list of remaining possible word indexes, as this gets more selective as we go. Although unnecessary in this case, a further optimization would be to start with the tokens that consist of only a single letter as they are likely to be more restrictive. It's the same sort of approach a database engine would take when evaluating a multi-clause where predicate (do the one that is likely to be the most restrictive first). The worst case run time complexity of this approach is O(D.L) which is good enough when D=5000 and L=15.

Alternately, if you are handy with regex syntax and are confident your language has an efficient regex implementation you can resort to using that.

The C# code is shown below:

using System;
using System.Collections.Generic;
using System.IO;

namespace GoogleCodeJam.AlienLanguage
{
    class Program
    {
        static void Main()
        {
            const string basePath = @"D:\";
            var infile = new StreamReader(basePath + "large.txt");
            var outfile = new StreamWriter(basePath + "output.txt");

            string[] input = (infile.ReadLine()).Split(new[] { ' ' }, StringSplitOptions.None);
            int L = Int32.Parse(input[0]);  // length of each word
            int D = Int32.Parse(input[1]);  // number of words
            int N = Int32.Parse(input[2]);  // number of test cases

            var words = new List<string>();
            for (int i = 0; i < D; i++)
                words.Add(infile.ReadLine().Trim());

            for (int caseNo = 1; caseNo <= N; caseNo++)
            {
                var wordIndicies = new List<int>();
                var removalList = new List<int>();
                for (int i = 0; i < D; i++)
                    wordIndicies.Add(i);

                var tokens = GetTokens(infile.ReadLine());
                for (int i = 0; i < L; i++)
                {
                    foreach (var j in wordIndicies)
                        if (!tokens[i].Contains(words[j].Substring(i, 1)))
                            removalList.Add(j);
                    foreach (var k in removalList)
                        wordIndicies.Remove(k);
                }
                outfile.WriteLine(String.Format("Case #{0}: {1}", caseNo, wordIndicies.Count));
            }
            infile.Close();
            outfile.Close();
        }

        public static List<string> GetTokens(string input)
        {
            var results = new List<string>();
            bool inToken = false;
            string currentToken = "";
            foreach(char c in input)
            {
                if (c == '(')
                {
                    currentToken = "";
                    inToken = true;
                }
                else if (c == ')')
                {
                    results.Add(currentToken);
                    inToken = false;
                }
                else
                {
                    if (inToken)
                        currentToken = currentToken + c;
                    else
                        results.Add(c.ToString());
                }
            }
            return results;
        }
    }
}

For completeness I also hacked together a solution in C++. Working with lists and the STL is somewhat painful so I've resorted to inelegant pre-allocated arrays (yuck!) and a large lookup map for the token matching, but hey, it solves the problem.

#include <string>
#include <list>
#include <iostream>

#define maxL 15
#define maxD 5000

using namespace std;

int L,D,N;
char words[maxD][maxL];
char tokenMap[maxL][26];
char input[28*maxL];

/* Builds a boolean matrix of flags indicating if the i-th token in the 
   pattern matches the j-th lowercase alphabet character */
void parseTokens()
{
    memset(tokenMap, 0, sizeof(tokenMap));
    int i,j = 0;
    for (i = 0; i < L; i++) {
        if (islower(input[j])) {
            tokenMap[i][input[j] - 'a'] = 1;
            j++;
        }
        else {
            j++;
            while(input[j] != ')') {
                tokenMap[i][input[j] - 'a'] = 1;
                j++;
            }
            j++;
        }
    }
}

void main()
{
    string basePath = "";
    string inFile = basePath + "large.txt";
    string outFile = basePath + "output.txt";

    freopen(inFile.c_str(),"r",stdin);
    freopen(outFile.c_str(),"w",stdout);

    cin >> L >> D >> N;

    for (int i=0; i<D; i++)
        cin >> words[i];

    for (int caseNo = 1; caseNo <= N; caseNo++)
    {
        cin >> input;
        parseTokens();

        int j,ans=0;
        for(int i=0; i<D; i++)    //for each word in the dictionary
        {
            for (j=0; j<L; j++)    //foreach char of the current word
            {
                if (!tokenMap[j][words[i][j] - 'a'])
                    break;
            }
            if (j == L) ans++;
        }
         cout << "Case #" << caseNo << ": " << ans <<endl;
    }
}
Bookmark and Share