This section of the archives stores flipcode's complete Developer Toolbox collection, featuring a variety of mini-articles and source code contributions from our readers.

 

  CleanLanguage Class With Fuzzy Compare
  Submitted by John W. Ratcliff



This code snippet will take any stream of ASCII input in the English language and search for objectionable language. If offending language is found, then the original input will be replaced with non-offensive alternatives.

This code snippet demonstrates the use of STL and STL strings, a basic parser, some simple encryption, and a very nice fuzzy string comparison routine which is contained in Gestalt.cpp and Gestalt.h

The files contained in this snippet are as follows:

  • cleanl.h : The CleanLanguage class header file.
  • cleanl.cpp : The CleanLanguage class implementation.
  • rand.h : A random number class.
  • translate.h : An alphabetic translation class.
  • gestalt.h : The Gestalt class header file providing a fuzzy compare
  • gestalt.cpp : The Gestalt class implementation.
  • test.cpp : A Test application demonstrating the usa


  • Download the zipped source package here (29k)

    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [cleanl.cpp] - (11,673 bytes)

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <assert.h>

    #include "translate.h" // alphabet translation tables #include "cleanl.h" // header file for clean language class #include "gestalt.h"

    // Database. static const unsigned char data[] = { 224,105,247,13,163,30,149,0,90,178,30,149,30,150,37,0,224,105, 247,13,163,54,195,180,0,103,71,205,30,190,37,163,54,195,180,0, 224,105,247,13,163,0,190,37,163,30,8,103,71,205,30,0,224,150, 178,54,90,0,150,90,247,105,105,0,224,184,247,150,150,222,0,163, 54,90,90,30,195,0,224,84,247,103,103,150,178,54,90,0,69,71, 71,8,69,71,71,0,224,190,71,90,178,30,149,105,247,13,163,30, 149,0,105,37,190,54,103,222,8,180,247,222,0,224,37,150,150,178, 71,103,30,0,84,71,43,71,0,224,190,71,90,178,30,149,105,247, 13,163,54,195,180,0,105,247,195,8,150,30,30,163,54,195,180,0, 224,84,54,90,13,178,0,105,30,190,37,103,30,8,69,71,180,0, 224,13,71,13,163,150,247,13,163,30,149,0,184,103,30,37,150,247, 149,30,8,180,54,205,30,149,0,224,105,247,13,163,30,69,0,184, 30,195,30,90,149,37,90,30,69,0,224,105,247,13,163,30,69,0, 54,195,8,149,30,37,103,103,222,8,69,30,30,184,8,90,149,71, 247,84,103,30,0,224,37,150,150,178,71,103,30,0,84,54,180,8, 69,71,195,163,30,222,0,224,84,54,90,13,178,0,195,54,13,30, 8,180,247,222,0,224,105,247,163,0,150,178,71,205,30,0,224,105, 247,13,0,150,178,71,205,30,0,224,195,54,180,180,30,149,0,180, 149,30,37,90,8,180,247,222,0,224,184,247,150,150,222,0,195,54, 13,30,8,180,247,222,0,224,150,178,54,90,0,13,149,37,184,0, 224,150,178,222,90,0,13,149,37,184,0,224,195,30,180,30,149,0, 195,54,13,30,8,180,247,222,0,184,149,54,13,163,0,150,178,37, 149,184,8,184,37,54,195,0,178,30,103,103,0,178,30,13,163,0, 69,37,190,195,0,69,37,149,195,0,13,247,195,90,0,163,54,90, 90,222,0,184,54,150,150,0,128,71,151,244,0,150,178,54,90,178, 30,37,69,0,184,54,195,178,30,37,69,0,180,71,69,69,37,190, 195,30,69,0,180,71,150,178,8,69,37,149,195,0,84,54,90,13, 178,54,195,180,0,200,178,54,195,54,195,180,0,84,37,150,90,37, 149,69,0,105,37,90,178,30,149,103,30,150,150,0,37,150,150,0, 69,71,195,163,30,222,0,13,103,54,90,0,90,71,222,0,150,247, 13,163,150,0,54,150,8,103,37,190,30,0,13,71,13,163,0,149, 71,71,150,90,30,149,0,69,54,13,163,0,134,54,13,178,37,149, 69,0,69,54,13,163,178,30,37,69,0,134,54,13,178,37,149,69, 155,150,8,195,71,180,180,54,195,0,13,247,190,0,103,54,22,247, 54,69,0,84,103,71,200,239,71,84,0,54,195,105,103,37,90,54, 71,195,37,149,222,8,30,190,184,103,71,222,190,30,195,90,0,90, 54,90,0,84,71,71,84,54,30,0,90,54,90,150,0,184,54,103, 103,71,200,150,0,84,37,103,103,150,0,84,149,37,205,30,149,222, 0,205,37,180,54,195,37,0,103,37,149,180,30,8,71,184,30,195, 54,195,180,0,184,30,195,54,150,0,69,71,103,184,178,54,195,0, 71,149,180,37,190,150,0,71,149,54,180,37,190,54,0,84,149,30, 37,150,90,150,0,13,178,54,13,163,30,195,150,0,84,149,30,37, 150,90,0,150,71,105,90,84,37,103,103,0,30,13,150,90,37,13, 222,0,178,37,184,184,222,8,69,37,222,150,0,195,54,184,184,103, 30,0,184,37,13,54,105,54,30,149,0,150,247,13,163,103,30,69, 0,69,149,37,195,163,0,190,37,195,178,71,71,69,0,90,178,30, 8,84,54,180,8,71,195,30,0,84,54,71,90,13,178,0,195,54, 13,30,8,180,247,222,0,84,54,37,90,13,178,0,195,54,13,30, 8,180,247,222,0,84,247,195,180,0,195,71,150,30,0,13,178,54, 180,37,84,71,71,0,195,54,13,30,8,180,247,222,0,13,178,54, 195,13,0,195,54,13,30,8,180,247,222,0,13,178,54,195,163,0, 195,54,13,30,8,180,247,222,0,13,103,54,90,0,84,54,180,8, 90,71,30,0,13,71,13,163,0,84,54,180,8,90,71,30,0,13, 247,190,0,150,184,37,190,0,150,247,13,163,0,84,149,30,37,90, 178,30,8,90,178,149,71,247,180,178,8,37,8,150,90,149,37,200, 0,13,247,195,195,54,103,54,195,180,247,150,0,195,71,150,30,8, 149,247,84,84,54,195,180,0,13,247,195,90,0,195,71,150,90,149, 54,103,0,69,54,13,163,0,84,54,180,8,90,71,30,0,69,54, 103,69,71,0,90,71,37,150,90,30,149,0,69,222,163,30,0,195, 54,13,30,8,180,247,222,0,105,37,180,0,195,54,13,30,8,180, 247,222,0,105,30,103,103,37,90,54,71,0,163,54,150,150,54,195, 180,0,180,71,69,69,0,180,71,103,103,222,0,239,37,13,163,71, 0,195,54,13,30,8,180,247,222,0,239,30,149,163,71,0,195,54, 13,30,8,180,247,222,0,239,30,149,163,0,195,54,13,30,8,180, 247,222,0,239,54,180,37,84,71,71,0,195,54,13,30,8,180,247, 222,0,239,54,150,190,0,150,184,37,190,0,239,54,43,0,150,184, 37,190,0,163,54,163,30,0,195,54,13,30,8,180,247,222,0,163, 103,54,90,0,195,71,150,90,149,54,103,0,163,247,195,90,0,195, 54,13,30,8,180,247,222,0,163,222,163,30,0,195,54,13,30,8, 180,247,222,0,103,30,150,84,71,0,195,54,13,30,8,180,247,222, 0,103,30,43,84,71,0,195,54,13,30,8,180,247,222,0,195,54, 180,180,0,195,54,13,30,8,180,247,222,0,184,30,195,54,150,0, 84,54,180,8,90,71,30,0,184,54,150,150,0,184,247,163,30,0, 184,178,247,0,150,178,71,205,30,0,184,149,54,13,163,0,195,54, 13,30,8,180,247,222,0,150,195,37,90,13,178,0,195,71,150,90, 149,54,103,0,150,184,30,149,190,0,239,30,103,103,71,0,150,184, 54,13,0,195,54,13,30,8,180,247,222,0,90,54,90,0,150,184, 37,149,149,71,200,0,90,200,37,90,0,195,71,150,90,149,54,103, 0,205,37,180,54,195,0,195,71,150,90,149,54,103,0,200,71,184, 0,195,54,13,30,8,180,247,222,0,37,149,150,13,178,0,69,71, 195,163,30,222,0,84,103,37,150,30,195,0,180,30,149,190,37,195, 8,200,71,149,69,0,84,247,190,150,30,195,0,180,30,149,190,37, 195,8,200,71,149,69,0,105,54,13,163,0,150,178,71,205,30,0, 105,54,13,163,30,195,0,180,30,149,190,37,195,8,200,71,149,69, 0,105,54,13,163,30,149,0,180,30,149,190,37,195,8,200,71,149, 69,0,105,247,90,43,30,0,180,30,149,190,37,195,8,200,71,149, 69,0,180,71,90,90,205,30,149,69,37,190,190,90,0,180,71,103, 103,222,0,178,54,90,103,30,149,0,150,190,247,149,105,0,178,247, 149,30,0,195,54,13,30,8,180,247,222,0,178,247,149,30,150,71, 178,195,0,195,54,13,30,8,180,247,222,0,54,90,37,163,30,149, 0,180,30,149,190,37,195,8,200,71,149,69,0,163,37,105,105,30, 149,0,180,30,149,190,37,195,8,200,71,149,69,0,163,103,54,90, 71,149,54,150,0,195,71,150,90,149,54,103,0,103,30,150,84,30, 0,195,54,13,30,8,180,247,222,0,195,37,43,54,0,195,54,13, 30,8,180,247,222,0,195,247,90,90,30,0,180,30,149,190,37,195, 8,200,71,149,69,0,184,54,30,190,30,103,0,180,30,149,190,37, 195,8,200,71,149,69,0,184,54,190,190,30,103,0,180,30,149,190, 37,195,8,200,71,149,69,0,184,54,150,150,30,0,184,247,163,30, 0,184,54,150,150,30,195,0,184,247,163,54,195,180,0,184,54,150, 150,30,149,0,195,54,13,30,8,180,247,222,0,184,71,103,37,163, 30,0,195,54,13,30,8,180,247,222,0,150,13,178,103,37,190,184, 30,0,180,30,149,190,37,195,8,200,71,149,69,0,150,13,178,200, 37,195,43,0,84,54,180,8,90,71,30,0,150,13,178,200,30,54, 195,0,195,54,13,30,8,180,247,222,0,150,13,178,200,247,13,178, 90,30,103,0,180,30,149,190,37,195,8,200,71,149,69,0,150,13, 178,200,30,103,0,180,30,149,190,37,195,8,200,71,149,69,0,150, 13,178,200,247,103,54,195,0,180,30,149,190,37,195,8,200,71,149, 69,0,150,184,30,149,190,37,0,150,184,37,190,0,90,54,90,90, 30,0,150,184,37,149,149,71,200,0,200,54,13,178,150,30,0,180, 30,149,190,37,195,8,200,71,149,69,0, };

    CleanLanguage::CleanLanguage(void) { mTranslate = new Translate(13); // build ascii translation table // Place contents of database into an STL vector of strings. const unsigned char *foo = data;

    while ( *foo ) { String str = (char *)foo; mWords.push_back(str); foo+=str.size()+1; // advance to next string. }

    // get translated wildcard character. mWildChar = mTranslate->TranslateIn( (unsigned char) '*' ); }

    CleanLanguage::~CleanLanguage(void) { delete mTranslate; }

    // True if alphabetic character bool CleanLanguage::IsAlpha(char c) { if ( c >= 'a' && c <= 'z' ) return true; if ( c >= 'A' && c <= 'Z' ) return true; return false; }

    // Add a character to cleaned up output. bool CleanLanguage::Add(char c,char *clean,int &index,int maxlen) { if ( index < (maxlen-1) ) { clean[index] = c; index++; clean[index] = 0; return true; } return false; }

    // will analyze an input string for foul language, and will replace // objectionable language with non-offending ascii text in the ouput // string. Returns count of the number of offending words found. int CleanLanguage::WashYourMouth(const String &input,String &output) const { int l = input.size(); if ( !l ) return 0;

    int dirtycount = 0;

    int maxlen = l*8; // maximum size output version could ever be. char *clean = new char[maxlen]; char *clean_alloc = clean;

    int wordlen = maxlen; if ( wordlen < 256 ) wordlen = 256;

    char *word = new char[wordlen];

    *clean = 0;

    int i = 0;

    const char *dirty = input.c_str();

    while ( *dirty ) {

    while ( !IsAlpha(*dirty) && *dirty ) { Add(*dirty,clean,i,maxlen); // add non-alphabetic characters. dirty++; }

    char *wordat = word;

    while ( IsAlpha(*dirty) && *dirty ) { *wordat++ = *dirty++; }

    *wordat = 0;

    // grab sequence of alphabetic characters and then analyze them. String wordin = word; String wordout;

    int match = CheckWord(wordin,wordout); if ( match ) dirtycount++;

    const char *foo = wordout.c_str();

    while ( *foo ) { Add(*foo++,clean,i,maxlen); }

    }

    output = clean_alloc;

    delete word; delete clean_alloc;

    return dirtycount; }

    // See if the word is 'bad' and return a cleaned up version in good. // Returns percentage match if any. int CleanLanguage::CheckWord(const String &bad,String &good) const { int size = bad.size();

    if ( size >= 255 ) // we don't process words more than 255 characters long { good = "too long"; return 100; // replaced! };

    int percent = 0; // best matching percentage. char temp[256]; // copy potential bad string and convert to new alphabet char match[256]; // translated result strcpy(temp,bad.c_str());

    bool uppercase = false; if ( *temp >= 'A' && *temp <= 'Z' ) uppercase = true; strlwr(temp);

    mTranslate->TranslateIn((unsigned char *)temp); // change alphabet StringVector::const_iterator i = mWords.begin();

    while ( i != mWords.end() ) {

    const unsigned char *word = (const unsigned char *)(*i).c_str();

    ++i;

    if ( *word == mWildChar ) // process as a wildcard! { if ( !percent && WildCard((const char *)&word[1],temp) ) { percent = 100; // perfect match to wildcard!! strcpy(match,(*i).c_str()); } } else { // Perform fuzzy compare between the two strings. int alike = Gestalt::FuzzyCompare(temp,(const char *)word); // Heuristic based on length of word and percentage match if ( ( size <= 4 && alike > 99 ) || ( size >= 5 && size < 7 && alike > 85 ) || ( size >= 7 && size < 10 && alike > 80 ) || ( size >= 10 && alike > 75 ) ) { if ( alike > percent ) // if best match so far. { strcpy(match,(*i).c_str()); percent = alike; } } } ++i; }

    if ( percent ) { mTranslate->TranslateOut((unsigned char *)match); if ( uppercase ) match[0]-=32; good = match; } else { good = bad; // nothing wrong with it. }

    return percent; }

    // checks to see if the sequence of characters in 'wild' are to be found anywhere // inside 'match' bool CleanLanguage::WildCard(const char *wild,const char *match) { int w = strlen(wild); int m = strlen(match);

    while ( w <= m ) { if ( strncmp(wild,match,w) == 0 ) return true; match++; m--; }

    return false; }

    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [cleanl.h] - (2,253 bytes)

    #ifndef CLEANL_H

    #define CLEANL_H

    // The CleanLanguage class will parse any set of input ascii data // in the english language, look for objectionable words, and replace them // with non-objectionable words. It uses a fuzzy string compare routine to // detect inappropriate language even if it is not spelled exactly // correctly. Certainly users can find ways to defeat this parser, but // it has proven amusing and harmless in the past. The word lists and // intermediate strings are all converted into another alphabet, so you // shouldn't find yourself offended by the language in the code, or even // hackers dissassembling it for that matter. // // This code makes use of STL for String operations. // // OpenSourced July 25, 2000, by John W. Ratcliff (jratcliff@verant.com) // for FlipCode.com.

    #pragma warning( disable:4786 ) #pragma warning(disable:4800) // int forced to bool #include <string> #include <vector>

    typedef std::string String; typedef std::vector< String > StringVector;

    class Translate; // forward reference ascii translation table. class CleanLanguage { public: CleanLanguage(void); ~CleanLanguage(void);

    // will analyze an input string for foul language, and will replace // objectionable language with non-offending ascii text in the ouput // string. Returns count of the number of offending words found. int WashYourMouth(const String &input,String &output) const;

    // Check this potential 'bad' word against the word list. // If the word was objectionable it will be replaced with an // appropriate alternative in the String 'good'. If the word // is not offensive, then good will simply contain the original // string passed. The return value is the percentage match found // between the offending word and one in the database. int CheckWord(const String &bad,String &good) const;

    static bool IsAlpha(char c);

    static bool WildCard(const char *wild,const char *match);

    private:

    static bool Add(char c,char *clean,int &index,int maxlen);

    Translate *mTranslate; // translation table. StringVector mWords; // bad words and their alternates. unsigned char mWildChar; // wild card character. };

    #endif

    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [gestalt.cpp] - (1,147 bytes)

    // GESTALT.C : A fuzzy compare.
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include "gestalt.h"

    int Gestalt::FuzzyCompare(const char *s1,const char *s2) {

    int l1 = strlen(s1); int l2 = strlen(s2);

    if (l1 == 1) if (l2 == 1) if (*s1 == *s2) return(100);

    return(200 * GCsubstr(s1, s1+l1, s2, s2+l2)/ (l1+l2)); }

    int Gestalt::GCsubstr(const char *st1,const char *end1,const char *st2,const char *end2) { if (end1 <= st1) return 0; if (end2 <= st2) return 0; if (end1 == (st1+1) && end2 == (st2+1) ) return 0;

    const char *s1; const char *s2; int max = 0; const char *b1 = end1; const char *b2 = end2;

    for (const char *a1 = st1; a1 < b1; a1++) { for (const char * a2 = st2; a2 < b2; a2++) { if (*a1 == *a2) {

    for (int i=1; a1[i] && (a1[i] == a2[i]); i++);

    if (i > max) { max = i; s1 = a1; s2 = a2; b1 = end1 - max; b2 = end2 - max; }

    } } }

    if (!max) return 0;

    max += GCsubstr(s1+max, end1, s2+max, end2); max += GCsubstr(st1, s1, st2, s2);

    return max; }


    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [gestalt.h] - (338 bytes)

    #ifndef GESTALT_H

    #define GESTALT_H

    // Fuzzy compare class. Returns similarity of two strings as a // percentage value 0-100. class Gestalt { public: static int FuzzyCompare(const char *s1,const char *s2); private: static int GCsubstr(const char *st1,const char *end1,const char *st2,const char *end2); };

    #endif

    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [rand.h] - (3,273 bytes)

    #ifndef RAND_H

    #define RAND_H

    //** Random number class and Random number pool class. This acts as a //** replacement for the stdlib rand() function. Why would you want to //** replace the rand() function? Because you often want a deteriminstic //** random number generator that is exactly the same regardless of what //** machine or compiler you build your code on. The stdlib rand() function //** is not guaraenteed to produce the same results on different stdlib //** implementations. //** //** You can also maintain any number of unique random number generators //** as state machines by instancing this rand class over and over again. //** //** The random number pool is a data structure which you allocate if you //** want to pull items from a data set, randomly, but without any //** duplications. A simple example would be a deck of cards. You would //** instantiate the random number pool as follows: //** //** RandPool deck(52); //** //** You would then pull cards from the deck as follows: //** //** bool shuffled; //** int card = deck.Get(shuffled); //** //** This will return a number between 0-51 (representing a card in the deck) //** without ever reporting the same card twice until the deck has been //** exhausted. If the boolean 'shuffled' is true, then the deck was //** re-shuffled on that call. This data structure has lots of uses in //** computer games where you want to randomly select data from a fixed //** pool size. //** //** This code submitted to FlipCode.com on July 23, 2000 by John W. Ratcliff //** It is released into the public domain on the same date. class Rand { public:

    Rand(int seed=0) { mCurrent = seed; };

    // random number between 0 - 32767 int Get(void) { return(((mCurrent = mCurrent * 214013L + 2531011L) >> 16) & 0x7fff); };

    // random number between 0.0 and 1.0 float GetFloat(void) { return float(Get())*(1.0f/32767.0f); };

    void Set(int seed) { mCurrent = seed; };

    private: int mCurrent; };

    class RandPool { public: RandPool(int size,int seed) // size of random number bool. { mRand.Set(seed); // init random number generator. mData = new int[size]; // allocate memory for random number bool. mSize = size; mTop = mSize; for (int i=0; i<mSize; i++) mData[i] = i; } ~RandPool(void) { delete mData; };

    // pull a number from the random number pool, will never return the // same number twice until the 'deck' (pool) has been exhausted. // Will set the shuffled flag to true if the deck/pool was exhausted // on this call. int Get(bool &shuffled) { if ( mTop == 0 ) // deck exhausted, shuffle deck. { shuffled = true; mTop = mSize; } else shuffled = false; int entry = mRand.Get()%mTop; mTop--; int ret = mData[entry]; // swap top of pool with entry mData[entry] = mData[mTop]; // returned mData[mTop] = ret; return ret; };

    private: Rand mRand; // random number generator. int *mData; // random number bool. int mSize; // size of random number pool. int mTop; // current top of the random number pool. };

    #endif


    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [test.cpp] - (1,109 bytes)

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <assert.h>
    #include <conio.h>

    #include "cleanl.h"

    void main(int argc,char **argv) {

    CleanLanguage filter;

    printf("Enter test phrase with potentially objectional language\n"); printf("Or enter 'bye', 'quit', or 'end' to exit:\n");

    bool done=false; char buf[1024];

    do { if( fgets( buf, sizeof( buf ), stdin ) == NULL ) { done = true; break; }

    String command = buf;

    if ( command == "bye\n" || command == "end\n" || command == "quit\n" ) { done = true; break; } else { String output;

    int count = filter.WashYourMouth(command,output);

    if ( count ) { if ( count == 1 ) printf("*** One objectionable word encountered!\n"); else printf("*** %d objectionable words encountered!\n",count);

    printf("*** Non-objectionable version is:\n"); printf("%s\n",output.c_str()); } else printf("Input is ok.\n"); }

    } while ( !done ); }

    Currently browsing [fuzzycompare.zip] (29,306 bytes) - [translate.h] - (1,448 bytes)

    #ifndef TRANSLATE_H

    #define TRANSLATE_H

    // Build a random ASCII translation table between one alphabet and another. #include "rand.h"

    class Translate { public: Translate(int key) // build ASCII translation table. { RandPool pool(255,key); // random number pool. for (int i=0; i<255; i++) { bool shuffled; unsigned char v = pool.Get(shuffled)+1; mTableIn[i+1] = v; mTableOut[v] = i+1; } mTableIn[0] = 0; // zero always translates to zero! mTableOut[0] = 0; };

    unsigned char TranslateIn(unsigned char c) const { return mTableIn[c]; };

    unsigned char TranslateOut(unsigned char c) const { return mTableOut[c]; };

    int TranslateIn(unsigned char *foo) { // translate asciiz string into or out of translation table int len = 0; while ( *foo ) { unsigned char v = *foo; v = TranslateIn(v); *foo++ = v; len++; } return len; // returns length of string translated. };

    int TranslateOut(unsigned char *foo) { // translate asciiz string into or out of translation table int len = 0; while ( *foo ) { unsigned char v = *foo; v = TranslateOut(v); *foo++ = v; len++; } return len; // returns length of string translated. };

    private: unsigned char mTableIn[256]; unsigned char mTableOut[256]; };

    #endif

    The zip file viewer built into the Developer Toolbox made use of the zlib library, as well as the zlibdll source additions.

     

    Copyright 1999-2008 (C) FLIPCODE.COM and/or the original content author(s). All rights reserved.
    Please read our Terms, Conditions, and Privacy information.