import java.io.*; import java.text.*; import java.lang.Math; import com.oroinc.text.regex.*; class Stemmer { public static void main(String args[]) { boolean debug = false; final int STEM = 0, FIND = 1, SUBSTITUTE = 2, MCOUNT = 3; int options = STEM; if (args.length < 2) { System.out.println("Usage: java Stemmer -[sfSm] [ one or more words to stem, space delimited ]"); System.out.println("-s = show word stems, if no option is specified, this is the default."); System.out.println("-f [...] = show if pattern is found in word"); System.out.println("Example for this option:"); System.out.println("java Stemmer sses$ accesses ies$ parties"); System.out.println("-S [ ...] = show substitutions"); System.out.println("Example for this option:"); System.out.println("java Stemmer sses$ ss accesses ies$ y parties"); System.out.println("-m = show m count for a word."); System.out.println("You may only use one of the above options for any particular run of the program."); return; } Collator strCmp = Collator.getInstance(); PrintWriter Log = new PrintWriter(new nulOutputStream());; for (int i = 0; i < args.length; i++) { if (strCmp.compare(args[i], "-f") == 0) { options = FIND; args[i] = ""; } else if (strCmp.compare(args[i], "-S") == 0) { options = SUBSTITUTE; args[i] = ""; } else if (strCmp.compare(args[i], "-m") == 0) { options = MCOUNT; args[i] = ""; } else if (strCmp.compare(args[i], "-s") == 0) { options = STEM; args[i] = ""; } else if (strCmp.compare(args[i], "-d") == 0) { args[i] = ""; if (args.length < i+1) { System.out.println("Proper usage is '-d [logFilename]'"); return; } Log.close(); if (strCmp.compare(args[++i], "stdout") == 0) Log = new PrintWriter(System.out); else if (strCmp.compare(args[i], "stderr") == 0) Log = new PrintWriter(System.err); else { try { Log = new PrintWriter(new FileOutputStream(args[i], false)); debug = true; } catch (FileNotFoundException e) { System.out.println("Proper usage is '-d [logFilename]'"); System.out.println(e.getMessage()); return; } System.out.println("Using debug output filename, '" + args[i] + "'"); } args[i] = ""; } } if (options == FIND) { System.out.println("Finding patterns in words."); for (int i = 1; i < args.length-1; i+=2) { if (find(args[i], args[i+1], Log)) { System.out.println("pattern: '" + args[i] + "' found in word: '" + args[i+1] + "'"); } else { System.out.println("pattern: '" + args[i] + "' was not found " + "in word: '" + args[i+1] + "'"); } } } else if (options == SUBSTITUTE) { System.out.println("Substituting in words."); for (int i = 1; i < args.length-2; i+=3) { String changed_word = substitute(args[i+2], args[i+1], args[i], Log); if (changed_word != args[i+2]) { System.out.println("Original word: '" + args[i+2] + "', " + "substitution: '" + changed_word + "'"); } else { System.out.println("Word: '" + args[i+2] + "' unchanged."); } } } else if (options == MCOUNT) { System.out.println("Counting M's"); for (int i = 1; i < args.length; i++) { System.out.println("M Count: " + count_Ms(args[i], Log) + " for word: '" + args[i] + "'"); } } else if (options == STEM) { System.out.println("Stemming arguments."); for (int i = 1; i < args.length; i++) { System.out.println("Original word: '" + args[i] + "', Stem: '" + stem(args[i], Log) + "'"); } } Log.close(); return; } // This algorithm obtained through // "Speech and Language Processing," Jurafsky and Martin, // Prentice Hall, 2000, pp 833-836. public static String stem(String word, PrintWriter Log) { String stemmed_word, prev_word; // The Porter Pieces... int measure; // The order here is important, as "s$" could match the same things as // "ss$" and "sses$". // // Step 1, Plurals, and Third Person Singular. // SSES -> SS // IES -> I // SS -> SS, in our case, no action. // S -> // stemmed_word = word; prev_word = stemmed_word; if (find(".*sses$", word, Log)) stemmed_word = substitute(word, "sses$", "ss", Log); else if (find(".*ies$", word, Log)) stemmed_word = substitute(word, "ies$", "i", Log); else if (find(".*ss$", word, Log)) ; else if (find(".*s$", word, Log)) stemmed_word = substitute(word, "s$", "", Log); // if (prev_word != stemmed_word) // Log.println(prev_word + " became " + stemmed_word + " in step 1"); prev_word = stemmed_word; // Step 2a, Verbal Past Tense and Progressive Forms. // *v* -- contains a vowel. int m = count_Ms(stemmed_word, Log); boolean useStep2b = false; // (m > 1) EED -> EE if ((m - count_Ms("eed", Log)) > 1 && find(".*eed$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "eed$", "ee", Log); else if (find(".*[aeiou].*", stemmed_word, Log)) { // (*v*) ED -> "" if (find(".*ed$", stemmed_word, Log)) { useStep2b = true; stemmed_word = substitute(stemmed_word, "ed$", "", Log); // (*v*) ING -> "" } else if (find(".*ing$", stemmed_word, Log)) { useStep2b = true; stemmed_word = substitute(stemmed_word, "ing$", "", Log); } } // if (prev_word != stemmed_word) // Log.println(prev_word + " became " + stemmed_word + " in step 2a"); prev_word = stemmed_word; // Step 2b, Cleanup...from step 2a. if (useStep2b) { // AT -> ATE if (find(".*at$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "at$", "ate", Log); // BL -> BLE if (find(".*bl$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "bl$", "ble", Log); // IZ -> IZE if (find(".*iz$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "iz$", "ize", Log); // (*d & !(*L or *S or *Z)) -> single letter // in other words, if the word ends in a double consonant, but // does not end in L, S, or Z, then replace the double letters with // single letters. if (!find(".*ss$", stemmed_word, Log) && !find(".*ll$", stemmed_word, Log) && !find(".*zz$", stemmed_word, Log) && doubleConsonant(stemmed_word)) stemmed_word = removeLastLetter(stemmed_word); m = count_Ms(stemmed_word, Log); // (m=1 & *o) "" -> E // *o = the word ends in a CVC, that does not end in W, X, or Y. if (m == 1 && PorterCVC(stemmed_word)) stemmed_word += "e"; } // if (prev_word != stemmed_word) // Log.println(prev_word + " became " + stemmed_word + " in step 2b"); prev_word = stemmed_word; // Step 3 // *v* Y -> I // *v* -- contains a vowel if (find(".*[aeiou].*", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "y$", "i", Log); // if (prev_word != stemmed_word) // Log.println(prev_word + " became " + stemmed_word + " in step 3"); prev_word = stemmed_word; // Step 4: Derivational Morphology I: Multiple Suffixes // (m > 0) ATIONAL -> ATE // (m > 0) TIONAL -> TION // (m > 0) ENCI -> ENCE // (m > 0) ANCI -> ANCE // (m > 0) IZER -> IZE // (m > 0) ABLI -> ABLE // (m > 0) ALLI -> AL // (m > 0) ENTLI -> ENT // (m > 0) ELI -> e // (m > 0) OUSLI -> OUS // (m > 0) IZATION -> IZE // (m > 0) ATION -> ATE // (m > 0) ATOR -> ATE // (m > 0) ALISM -> AL // (m > 0) IVENESS -> IVE // (m > 0) FULNESS -> FUL // (m > 0) OUSNESS -> OUS // (m > 0) ALITI -> AL // (m > 0) IVITI -> IVE // (m > 0) BILITI -> BLE m = count_Ms(stemmed_word, Log); if ((m - count_Ms("ational", Log)) > 0 && find(".*ational$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ational$", "ate", Log); else if ((m - count_Ms("tional", Log)) > 0 && find(".*tional$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "tional$", "tion", Log); else if ((m - count_Ms("enci", Log)) > 0 && find(".*enci$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "enci$", "ence", Log); else if ((m - count_Ms("anci", Log)) > 0 && find(".*anci$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "anci$", "ance", Log); else if ((m - count_Ms("izer", Log)) > 0 && find(".*izer$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "izer$", "ize", Log); else if ((m - count_Ms("abli", Log)) > 0 && find(".*abli$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "abli$", "able", Log); else if ((m - count_Ms("alli", Log)) > 0 && find(".*alli$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "alli$", "al", Log); else if ((m - count_Ms("entli", Log)) > 0 && find(".*entli$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "entli$", "ent", Log); else if ((m - count_Ms("eli", Log)) > 0 && find(".*eli$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "eli$", "e", Log); else if ((m - count_Ms("ousli", Log)) > 0 && find(".*ousli$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ousli$", "ous", Log); else if ((m - count_Ms("ization", Log)) > 0 && find(".*ization$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ization$", "ize", Log); else if ((m - count_Ms("ation", Log)) > 0 && find(".*ation$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ation$", "ate", Log); else if ((m - count_Ms("ator", Log)) > 0 && find(".*ator$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ator$", "ate", Log); else if ((m - count_Ms("alism", Log)) > 0 && find(".*alism$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "alism$", "al", Log); else if ((m - count_Ms("iveness", Log)) > 0 && find(".*iveness$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "iveness$", "ive", Log); else if ((m - count_Ms("fulness", Log)) > 0 && find(".*fulness$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "fulness$", "ful", Log); else if ((m - count_Ms("ousness", Log)) > 0 && find(".*ousness$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ousness$", "ous", Log); else if ((m - count_Ms("aliti", Log)) > 0 && find(".*aliti$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "aliti$", "al", Log); else if ((m - count_Ms("iviti", Log)) > 0 && find(".*iviti$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "iviti$", "ive", Log); else if ((m - count_Ms("biliti", Log)) > 0 && find(".*biliti$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "biliti$", "ble", Log); // if (prev_word != stemmed_word) { // Log.println(prev_word + " became " + stemmed_word + " in step 4"); // } prev_word = stemmed_word; // Step 5: Derivational Morphology II: More multiple suffixes // (m > 0) ICATE -> IC // (m > 0) ATIVE -> "" // (m > 0) ALIZE -> AL // (m > 0) ICITI -> IC // (m > 0) FUL -> "" // (m > 0) NESS -> "" // In the below there is a modification to the standard algorithm, due // to a bug in the measure of a word. The measure of the extension is // not taken into account in the stock algorithm, so we have to do this // manually. m = count_Ms(stemmed_word, Log); if ((m - count_Ms("icate", Log)) > 0 && find(".*icate$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "icate$", "ic", Log); else if ((m - count_Ms("ative", Log)) > 0 && find(".*ative$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ative$", "", Log); else if ((m - count_Ms("alize", Log)) > 0 && find(".*alize$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "alize$", "al", Log); else if ((m - count_Ms("iciti", Log)) > 0 && find(".*iciti$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "iciti$", "ic", Log); else if ((m - count_Ms("ful", Log)) > 0 && find(".*ful$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ful$", "", Log); else if ((m - count_Ms("ness", Log)) > 0 && find(".*ness$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ness$", "", Log); // if (prev_word != stemmed_word) { // Log.println(prev_word + " became " + stemmed_word + " in step 5"); // } prev_word = stemmed_word; // Step 6: Derivational Morphology III: Single suffixes // (m > 1) AL -> "" // (m > 1) ANCE -> "" // (m > 1) ENCE -> "" // (m > 1) ER -> "" // (m > 1) IC -> "" // (m > 1) ABLE -> "" // (m > 1) ANT -> "" // (m > 1) EMENT -> "" // (m > 1) MENT -> "" // (m > 1) ENT -> "" // (m > 1) (*S or *T) & ION -> "" // (m > 1) OU -> "" // (m > 1) ISM -> "" // (m > 1) ATE -> "" // (m > 1) ITI -> "" // (m > 1) OUS -> "" // (m > 1) IVE -> "" // (m > 1) IZE -> "" m = count_Ms(stemmed_word, Log); if ((m - count_Ms("ali", Log)) > 1 && find(".*al$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "al$", "", Log); else if ((m - count_Ms("ance", Log)) > 1 && find(".*ance$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ance$", "", Log); else if ((m - count_Ms("ence", Log)) > 1 && find(".*ence$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ence$", "", Log); else if ((m - count_Ms("er", Log)) > 1 && find(".*er$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "er$", "", Log); else if ((m - count_Ms("ic", Log)) > 1 && find(".*ic$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ic$", "", Log); else if ((m - count_Ms("able", Log)) > 1 && find(".*able$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "able$", "", Log); else if ((m - count_Ms("ant", Log)) > 1 && find(".*ant$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ant$", "", Log); else if ((m - count_Ms("ement", Log)) > 1 && find(".*ement$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ement$", "", Log); else if ((m - count_Ms("ment", Log)) > 1 && find(".*ment$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ment$", "", Log); else if ((m - count_Ms("sion", Log)) > 1 && find(".*sion$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "sion$", "s", Log); else if ((m - count_Ms("tion", Log)) > 1 && find(".*tion$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "tion$", "t", Log); else if ((m - count_Ms("ou", Log)) > 1 && find(".*ou$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ou$", "", Log); else if ((m - count_Ms("ism", Log)) > 1 && find(".*ism$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ism$", "", Log); else if ((m - count_Ms("ate", Log)) > 1 && find(".*ate$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ate$", "", Log); else if ((m - count_Ms("iti", Log)) > 1 && find(".*iti$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "iti$", "", Log); else if ((m - count_Ms("ous", Log)) > 1 && find(".*ous$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ous$", "", Log); else if ((m - count_Ms("ive", Log)) > 1 && find(".*ive$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ive$", "", Log); else if ((m - count_Ms("ize", Log)) > 1 && find(".*ize$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "ize$", "", Log); // if (prev_word != stemmed_word) { // Log.println(prev_word + " became " + stemmed_word + " in step 6"); // } prev_word = stemmed_word; // Step 7a: Cleanup // (m > 1) E -> "" // (m = 1 & !*o) E -> "" // *o -- ends in a CVC, but doesn't end in W, X or Y m = count_Ms(stemmed_word, Log); if (m > 1 || (m == 1 && !PorterCVC(stemmed_word))) { if (find(".*e$", stemmed_word, Log)) stemmed_word = substitute(stemmed_word, "e$", "", Log); } // if (prev_word != stemmed_word) // Log.println(prev_word + " became " + stemmed_word + " in step 7a"); prev_word = stemmed_word; // Step 7b: Cleanup // (m > 1 & *d *L) -> single letter // *d -- ends in a double consonant // *L -- ends in an L m = count_Ms(stemmed_word, Log); if (m > 1 && find(".*ll$", stemmed_word, Log)) stemmed_word = removeLastLetter(stemmed_word); // if (prev_word != stemmed_word) // Log.println(prev_word + " became " + stemmed_word + " in step 7b"); return stemmed_word; } private static boolean PorterCVC(String word) { if (word.length() < 3) return false; if (isConsonant(word.charAt(word.length()-3)) && !isConsonant(word.charAt(word.length()-2)) && isConsonant(word.charAt(word.length()-1)) && word.charAt(word.length()-1) != 'w' && word.charAt(word.length()-1) != 'x' && word.charAt(word.length()-1) != 'y') return true; return false; } private static boolean doubleConsonant(String word) { if (word.length() < 2) return false; if (isConsonant(word.charAt(word.length()-1)) && word.charAt(word.length()-2) == word.charAt(word.length()-1)) return true; return false; } private static String removeLastLetter(String word) { String tmpWord = ""; for(int i = 0; i < word.length()-1; i++) tmpWord += word.charAt(i); return tmpWord; } private static int count_Ms(String word, PrintWriter Log) { // a word is defined as... (C) (VC)^m (C) int i = 0, j = 0, k = 0; // so... (C): for (i = 0; i < word.length() && isConsonant(word.charAt(i)); i++); // Log.print("'"); // for (k = 0; k < i; k++) // Log.print(word.charAt(k)); // Log.println("' makes up the first (C) group."); // And... (VC)^m int m; for (m = 1;;m++) { for (; i < word.length() && !isConsonant(word.charAt(i)); i++); // Log.print("'"); // for (; k < i; k++) // Log.print(word.charAt(k)); if (i >= word.length()) { // If this is true, then we have no C left, so we can simply // assume that we are in the last V string, so we need to reduce // m by one (because we are not really in a (VC) string... m--; // Log.println("' makes up the last (V) group."); break; } for (; i < word.length() && isConsonant(word.charAt(i)); i++); // for (; k < i; k++) // Log.print(word.charAt(k)); // Log.println("' makes up a (VC) group."); } // Log.println("The measure of '" + word + "' is " + m + "."); return m; } private static boolean isConsonant(char ch) { if (ch != 'a' && ch != 'e' && ch != 'i' && ch != 'o' && ch != 'u') return true; return false; } private static boolean find(String pat, String word, PrintWriter Log) { // Begin the ORO Matcher stuff... // ACK!! It's the STL, no its ORO...sometimes this truly becomes // insane.... Pattern pattern; Perl5Compiler compiler; PatternMatcher matcher; compiler = new Perl5Compiler(); matcher = new Perl5Matcher(); try { pattern = compiler.compile(pat, Perl5Compiler.CASE_INSENSITIVE_MASK); } catch(MalformedPatternException e) { System.err.println("Bad pattern."); System.err.println(e.getMessage()); return false; } boolean retVal = matcher.matches(word, pattern); // if (retVal) // Log.println("Match found for '" + pat + "' in '" + word + "'"); return retVal; } private static String substitute(String word, String pat, String subst, PrintWriter Log) { // Begin the ORO Matcher stuff... // ACK!! It's the STL, no its ORO...sometimes this truly becomes // insane.... Pattern pattern; Perl5Compiler compiler; PatternMatcher matcher; String result; int sub = Util.SUBSTITUTE_ALL; int interp = Perl5Substitution.INTERPOLATE_ALL; compiler = new Perl5Compiler(); matcher = new Perl5Matcher(); try { pattern = compiler.compile(pat, Perl5Compiler.CASE_INSENSITIVE_MASK); } catch(MalformedPatternException e) { System.err.println("Bad pattern."); System.err.println(e.getMessage()); return word; } String retVal = Util.substitute(matcher, pattern, new Perl5Substitution(subst, interp), word, sub); // if (retVal != word) // Log.println("Substituted '" + subst + "' for '" + pat + "' in '" + // word + "'"); return retVal; } }