import java.io.*;
import java.text.*;
import java.lang.Math;
import com.oroinc.text.regex.*;

class Stemmer {

  public static void main(String args[]) {
    boolean debug = false;
    final int STEM = 0, FIND = 1, SUBSTITUTE = 2, MCOUNT = 3;
    int options = STEM;
    if (args.length < 2) {
      System.out.println("Usage: java Stemmer -[sfSm] [ one or more words to stem, space delimited ]");
      System.out.println("-s = show word stems, if no option is specified, this is the default.");
      System.out.println("-f [<pattern word>...] = show if pattern is found in word");
      System.out.println("Example for this option:");
      System.out.println("java Stemmer sses$ accesses ies$ parties");
      System.out.println("-S [<pattern subst word> ...] = show substitutions");
      System.out.println("Example for this option:");
      System.out.println("java Stemmer sses$ ss accesses ies$ y parties");
      System.out.println("-m = show m count for a word.");
      System.out.println("You may only use one of the above options for any particular run of the program.");
      return;
    }
    Collator strCmp = Collator.getInstance();
    PrintWriter Log = new PrintWriter(new nulOutputStream());;
    for (int i = 0; i < args.length; i++) {
      if (strCmp.compare(args[i], "-f") == 0) {
	options = FIND; args[i] = "";
      } else if (strCmp.compare(args[i], "-S") == 0) {
	options = SUBSTITUTE; args[i] = "";
      } else if (strCmp.compare(args[i], "-m") == 0) {
	options = MCOUNT; args[i] = "";
      } else if (strCmp.compare(args[i], "-s") == 0) {
	options = STEM; args[i] = "";
      } else if (strCmp.compare(args[i], "-d") == 0) {
	args[i] = "";
	if (args.length < i+1) {
	  System.out.println("Proper usage is '-d [logFilename]'");
	  return;
	}
	Log.close();
	if (strCmp.compare(args[++i], "stdout") == 0)
	  Log = new PrintWriter(System.out);
	else if (strCmp.compare(args[i], "stderr") == 0)
	  Log = new PrintWriter(System.err);
	else {
	  try {
	    Log = new PrintWriter(new FileOutputStream(args[i], false));
	    debug = true;
	  } catch (FileNotFoundException e) {
	    System.out.println("Proper usage is '-d [logFilename]'");
	    System.out.println(e.getMessage());
	    return;
	  }
	  System.out.println("Using debug output filename, '" + 
			     args[i] + "'");
	}
	args[i] = "";
      }
    }
    if (options == FIND) {
      System.out.println("Finding patterns in words.");
      for (int i = 1; i < args.length-1; i+=2) {
	if (find(args[i], args[i+1], Log)) {
	  System.out.println("pattern: '" + args[i] + 
			     "' found in word: '" + args[i+1] + "'");
	} else {
	  System.out.println("pattern: '" + args[i] + "' was not found " +
			     "in word: '" + args[i+1] + "'");
	}
      }
    } else if (options == SUBSTITUTE) {
      System.out.println("Substituting in words.");
      
      for (int i = 1; i < args.length-2; i+=3) {
	String changed_word = substitute(args[i+2], args[i+1], args[i], 
					 Log);
	if (changed_word != args[i+2]) {
	  System.out.println("Original word: '" + args[i+2] + "', " +
			     "substitution: '" + changed_word + "'");
	} else {
	  System.out.println("Word: '" + args[i+2] + "' unchanged.");
	}
      }
    } else if (options == MCOUNT) {
      System.out.println("Counting M's");
      for (int i = 1; i < args.length; i++) {
	System.out.println("M Count: " + count_Ms(args[i], Log) + 
			   " for word: '" + args[i] + "'");
      }
    } else if (options == STEM) {
      System.out.println("Stemming arguments.");
      for (int i = 1; i < args.length; i++) {
	System.out.println("Original word: '" + args[i] + "', Stem: '" + 
			   stem(args[i], Log) + "'");
      }
    }
    Log.close();
    return;
  }
  

  // This algorithm obtained through 
  // "Speech and Language Processing," Jurafsky and Martin, 
  // Prentice Hall, 2000, pp 833-836.
  public static String stem(String word, PrintWriter Log) {
    String stemmed_word, prev_word;
    // The Porter Pieces...
    int measure;
    

    // The order here is important, as "s$" could match the same things as
    // "ss$" and "sses$".
    //
    // Step 1, Plurals, and Third Person Singular.
    // SSES -> SS
    // IES -> I
    // SS -> SS, in our case, no action.
    // S -> 
    //
    stemmed_word = word;
    prev_word = stemmed_word;
    if (find(".*sses$", word, Log))   
      stemmed_word = substitute(word, "sses$", "ss", Log);
    else if (find(".*ies$", word, Log)) 
      stemmed_word = substitute(word, "ies$", "i", Log);
    else if (find(".*ss$", word, Log)) ;
    else if (find(".*s$", word, Log)) 
      stemmed_word = substitute(word, "s$", "", Log); 

    //    if (prev_word != stemmed_word) 
    //      Log.println(prev_word + " became " + stemmed_word + " in step 1");
    prev_word = stemmed_word;
    // Step 2a, Verbal Past Tense and Progressive Forms.
    // *v* -- contains a vowel.
    int m = count_Ms(stemmed_word, Log);
    boolean useStep2b = false;
    // (m > 1) EED -> EE
    if ((m - count_Ms("eed", Log)) > 1 && find(".*eed$", stemmed_word, Log)) 
      stemmed_word = substitute(stemmed_word, "eed$", "ee", Log);
    else if (find(".*[aeiou].*", stemmed_word, Log)) {
      // (*v*) ED -> ""
      if (find(".*ed$", stemmed_word, Log)) {
	useStep2b = true;
	stemmed_word = substitute(stemmed_word, "ed$", "", Log);
	// (*v*) ING -> ""
      } else if (find(".*ing$", stemmed_word, Log)) {
	useStep2b = true;
	stemmed_word = substitute(stemmed_word, "ing$", "", Log);
      }
    }
    //    if (prev_word != stemmed_word)
    //      Log.println(prev_word + " became " + stemmed_word + " in step 2a");
    prev_word = stemmed_word;
    
    // Step 2b, Cleanup...from step 2a.
    if (useStep2b) {
      // AT -> ATE
      if (find(".*at$", stemmed_word, Log)) 
	stemmed_word = substitute(stemmed_word, "at$", "ate", Log);
      // BL -> BLE
      if (find(".*bl$", stemmed_word, Log))
	stemmed_word = substitute(stemmed_word, "bl$", "ble", Log);
      // IZ -> IZE
      if (find(".*iz$", stemmed_word, Log))
	stemmed_word = substitute(stemmed_word, "iz$", "ize", Log);
      // (*d & !(*L or *S or *Z)) -> single letter
      // in other words, if the word ends in a double consonant, but 
      // does not end in L, S, or Z, then replace the double letters with 
      // single letters.
      if (!find(".*ss$", stemmed_word, Log) && 
	  !find(".*ll$", stemmed_word, Log) &&
	  !find(".*zz$", stemmed_word, Log) && doubleConsonant(stemmed_word))
	stemmed_word = removeLastLetter(stemmed_word);
      m = count_Ms(stemmed_word, Log);
      // (m=1 & *o) "" -> E
      // *o = the word ends in a CVC, that does not end in W, X, or Y.
      if (m == 1 && PorterCVC(stemmed_word))
	stemmed_word += "e";
    }
    //    if (prev_word != stemmed_word)
    //      Log.println(prev_word + " became " + stemmed_word + " in step 2b");
    prev_word = stemmed_word;
    
    // Step 3
    // *v* Y -> I
    // *v* -- contains a vowel
    if (find(".*[aeiou].*", stemmed_word, Log)) 
      stemmed_word = substitute(stemmed_word, "y$", "i", Log);
    //    if (prev_word != stemmed_word)
    //      Log.println(prev_word + " became " + stemmed_word + " in step 3");
    prev_word = stemmed_word;
    
    // Step 4: Derivational Morphology I: Multiple Suffixes
    // (m > 0) ATIONAL -> ATE
    // (m > 0) TIONAL  -> TION
    // (m > 0) ENCI    -> ENCE
    // (m > 0) ANCI    -> ANCE
    // (m > 0) IZER    -> IZE
    // (m > 0) ABLI    -> ABLE
    // (m > 0) ALLI    -> AL
    // (m > 0) ENTLI   -> ENT
    // (m > 0) ELI     -> e
    // (m > 0) OUSLI   -> OUS
    // (m > 0) IZATION -> IZE
    // (m > 0) ATION   -> ATE
    // (m > 0) ATOR    -> ATE
    // (m > 0) ALISM   -> AL
    // (m > 0) IVENESS -> IVE
    // (m > 0) FULNESS -> FUL
    // (m > 0) OUSNESS -> OUS
    // (m > 0) ALITI   -> AL
    // (m > 0) IVITI   -> IVE
    // (m > 0) BILITI  -> BLE
    m = count_Ms(stemmed_word, Log);
    if ((m - count_Ms("ational", Log)) > 0 &&
	find(".*ational$", stemmed_word, Log)) 
      stemmed_word = substitute(stemmed_word, "ational$", "ate", Log);
    else if ((m - count_Ms("tional", Log)) > 0 &&
	     find(".*tional$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "tional$", "tion", Log);
    else if ((m - count_Ms("enci", Log)) > 0 &&
	     find(".*enci$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "enci$", "ence", Log);
    else if ((m - count_Ms("anci", Log)) > 0 &&
	     find(".*anci$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "anci$", "ance", Log);
    else if ((m - count_Ms("izer", Log)) > 0 && 
	     find(".*izer$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "izer$", "ize", Log);
    else if ((m - count_Ms("abli", Log)) > 0 &&
	     find(".*abli$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "abli$", "able", Log);
    else if ((m - count_Ms("alli", Log)) > 0 &&
	     find(".*alli$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "alli$", "al", Log);
    else if ((m - count_Ms("entli", Log)) > 0 &&
	     find(".*entli$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "entli$", "ent", Log);
    else if ((m - count_Ms("eli", Log)) > 0 &&
	     find(".*eli$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "eli$", "e", Log);
    else if ((m - count_Ms("ousli", Log)) > 0 &&
	     find(".*ousli$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ousli$", "ous", Log);
    else if ((m - count_Ms("ization", Log)) > 0 &&
	     find(".*ization$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ization$", "ize", Log);
    else if ((m - count_Ms("ation", Log)) > 0 &&
	     find(".*ation$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ation$", "ate", Log);
    else if ((m - count_Ms("ator", Log)) > 0 &&
	     find(".*ator$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ator$", "ate", Log);
    else if ((m - count_Ms("alism", Log)) > 0 &&
	     find(".*alism$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "alism$", "al", Log);
    else if ((m - count_Ms("iveness", Log)) > 0 &&
	     find(".*iveness$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "iveness$", "ive", Log);
    else if ((m - count_Ms("fulness", Log)) > 0 &&
	     find(".*fulness$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "fulness$", "ful", Log);
    else if ((m - count_Ms("ousness", Log)) > 0 &&
	     find(".*ousness$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ousness$", "ous", Log);
    else if ((m - count_Ms("aliti", Log)) > 0 &&
	     find(".*aliti$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "aliti$", "al", Log);
    else if ((m - count_Ms("iviti", Log)) > 0 &&
	     find(".*iviti$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "iviti$", "ive", Log);
    else if ((m - count_Ms("biliti", Log)) > 0 &&
	     find(".*biliti$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "biliti$", "ble", Log);
    //    if (prev_word != stemmed_word) {
    //      Log.println(prev_word + " became " + stemmed_word + " in step 4");
    //    }
    prev_word = stemmed_word;
    // Step 5: Derivational Morphology II: More multiple suffixes
    // (m > 0) ICATE -> IC
    // (m > 0) ATIVE -> ""
    // (m > 0) ALIZE -> AL
    // (m > 0) ICITI -> IC
    // (m > 0) FUL   -> ""
    // (m > 0) NESS  -> "" 
    // In the below there is a modification to the standard algorithm, due
    // to a bug in the measure of a word.  The measure of the extension is 
    // not taken into account in the stock algorithm, so we have to do this
    // manually.  
    m = count_Ms(stemmed_word, Log);
    if ((m - count_Ms("icate", Log)) > 0 &&
	find(".*icate$", stemmed_word, Log)) 
      stemmed_word = substitute(stemmed_word, "icate$", "ic", Log);
    else if ((m - count_Ms("ative", Log)) > 0 &&
	     find(".*ative$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ative$", "", Log);
    else if ((m - count_Ms("alize", Log)) > 0 &&
	     find(".*alize$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "alize$", "al", Log);
    else if ((m - count_Ms("iciti", Log)) > 0 &&
	     find(".*iciti$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "iciti$", "ic", Log);
    else if ((m - count_Ms("ful", Log)) > 0 &&
	     find(".*ful$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ful$", "", Log);
    else if ((m - count_Ms("ness", Log)) > 0 &&
	     find(".*ness$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ness$", "", Log);
    //    if (prev_word != stemmed_word) {
    //      Log.println(prev_word + " became " + stemmed_word + " in step 5");
    //    }
    prev_word = stemmed_word;
    // Step 6: Derivational Morphology III: Single suffixes
    // (m > 1) AL               -> ""
    // (m > 1) ANCE             -> ""
    // (m > 1) ENCE             -> ""
    // (m > 1) ER               -> ""
    // (m > 1) IC               -> ""
    // (m > 1) ABLE             -> ""
    // (m > 1) ANT              -> ""
    // (m > 1) EMENT            -> ""
    // (m > 1) MENT             -> ""
    // (m > 1) ENT              -> ""
    // (m > 1) (*S or *T) & ION -> ""
    // (m > 1) OU               -> ""
    // (m > 1) ISM              -> ""
    // (m > 1) ATE              -> ""
    // (m > 1) ITI              -> ""
    // (m > 1) OUS              -> ""
    // (m > 1) IVE              -> ""
    // (m > 1) IZE              -> ""
    m = count_Ms(stemmed_word, Log);
    if ((m - count_Ms("ali", Log)) > 1 &&
	find(".*al$", stemmed_word, Log)) 
      stemmed_word = substitute(stemmed_word, "al$", "", Log);
    else if ((m - count_Ms("ance", Log)) > 1 &&
	     find(".*ance$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ance$", "", Log);
    else if ((m - count_Ms("ence", Log)) > 1 &&
	     find(".*ence$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ence$", "", Log);
    else if ((m - count_Ms("er", Log)) > 1 &&
	     find(".*er$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "er$", "", Log);
    else if ((m - count_Ms("ic", Log)) > 1 &&
	     find(".*ic$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ic$", "", Log);
    else if ((m - count_Ms("able", Log)) > 1 &&
	     find(".*able$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "able$", "", Log);
    else if ((m - count_Ms("ant", Log)) > 1 &&
	     find(".*ant$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ant$", "", Log);
    else if ((m - count_Ms("ement", Log)) > 1 &&
	     find(".*ement$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ement$", "", Log);
    else if ((m - count_Ms("ment", Log)) > 1 && 
	     find(".*ment$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ment$", "", Log);
    else if ((m - count_Ms("sion", Log)) > 1 &&
	     find(".*sion$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "sion$", "s", Log);
    else if ((m - count_Ms("tion", Log)) > 1 &&
	     find(".*tion$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "tion$", "t", Log);
    else if ((m - count_Ms("ou", Log)) > 1 &&
	     find(".*ou$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ou$", "", Log);
    else if ((m - count_Ms("ism", Log)) > 1 &&
	     find(".*ism$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ism$", "", Log);
    else if ((m - count_Ms("ate", Log)) > 1 &&
	     find(".*ate$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ate$", "", Log);
    else if ((m - count_Ms("iti", Log)) > 1 &&
	     find(".*iti$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "iti$", "", Log);
    else if ((m - count_Ms("ous", Log)) > 1 &&
	     find(".*ous$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ous$", "", Log);
    else if ((m - count_Ms("ive", Log)) > 1 &&
	     find(".*ive$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ive$", "", Log);
    else if ((m - count_Ms("ize", Log)) > 1 &&
	     find(".*ize$", stemmed_word, Log))
      stemmed_word = substitute(stemmed_word, "ize$", "", Log);
    
    //    if (prev_word != stemmed_word) {
    //      Log.println(prev_word + " became " + stemmed_word + " in step 6");
    //    }
    prev_word = stemmed_word;

    // Step 7a: Cleanup
    // (m > 1)           E -> ""
    // (m = 1 & !*o)     E -> ""
    // *o -- ends in a CVC, but doesn't end in W, X or Y
    m = count_Ms(stemmed_word, Log);
    if (m > 1 || (m == 1 && !PorterCVC(stemmed_word))) {
      if (find(".*e$", stemmed_word, Log))
	stemmed_word = substitute(stemmed_word, "e$", "", Log);
    }
    //    if (prev_word != stemmed_word) 
    //      Log.println(prev_word + " became " + stemmed_word + " in step 7a");
    prev_word = stemmed_word;

    // Step 7b: Cleanup
    // (m > 1 & *d *L) -> single letter
    // *d -- ends in a double consonant
    // *L -- ends in an L
    m = count_Ms(stemmed_word, Log);
    if (m > 1 && find(".*ll$", stemmed_word, Log))
      stemmed_word = removeLastLetter(stemmed_word);

    //    if (prev_word != stemmed_word)
    //      Log.println(prev_word + " became " + stemmed_word + " in step 7b");


    return stemmed_word;
  }

  private static boolean PorterCVC(String word) {
    if (word.length() < 3) return false;
    if (isConsonant(word.charAt(word.length()-3)) &&
	!isConsonant(word.charAt(word.length()-2)) &&
	isConsonant(word.charAt(word.length()-1)) &&
	word.charAt(word.length()-1) != 'w' &&
	word.charAt(word.length()-1) != 'x' &&
	word.charAt(word.length()-1) != 'y')
      return true;
    return false;
  }
	    
  
  private static boolean doubleConsonant(String word) {
    if (word.length() < 2) return false;
    if (isConsonant(word.charAt(word.length()-1)) &&
	word.charAt(word.length()-2) == word.charAt(word.length()-1))
      return true;
    return false;
  }
  
  private static String removeLastLetter(String word) {
    String tmpWord = "";
    for(int i = 0; i < word.length()-1; i++)
      tmpWord += word.charAt(i);
    return tmpWord;
  }
  
  private static int count_Ms(String word, PrintWriter Log) {
    // a word is defined as... (C) (VC)^m (C)
    int i = 0, j = 0, k = 0;
    // so... (C):
    for (i = 0; i < word.length() && isConsonant(word.charAt(i)); i++);
    //    Log.print("'");
    //    for (k = 0; k < i; k++) 
    //      Log.print(word.charAt(k));
    //    Log.println("' makes up the first (C) group.");
    // And...  (VC)^m
    int m;
    for (m = 1;;m++) {
      for (; i < word.length() && !isConsonant(word.charAt(i)); i++);
      //      Log.print("'");
      //      for (; k < i; k++) 
      //	Log.print(word.charAt(k));
      if (i >= word.length()) {
	// If this is true, then we have no C left, so we can simply
	// assume that we are in the last V string, so we need to reduce
	// m by one (because we are not really in a (VC) string...
	m--;
	//	Log.println("' makes up the last (V) group.");
	break;
      }
      for (; i < word.length() && isConsonant(word.charAt(i)); i++);
      //      for (; k < i; k++) 
      //	Log.print(word.charAt(k));
      //      Log.println("' makes up a (VC) group.");
    }
    //    Log.println("The measure of '" + word + "' is " + m + ".");
    return m;
  }
  
  private static boolean isConsonant(char ch) {
    if (ch != 'a' && ch != 'e' && ch != 'i' && ch != 'o' && ch != 'u')
      return true;
    return false;
  }
  
  private static boolean find(String pat, String word, PrintWriter Log) {
    // Begin the ORO Matcher stuff...
    // ACK!!  It's the STL, no its ORO...sometimes this truly becomes
    // insane....
    
    Pattern pattern;
    Perl5Compiler compiler;
    PatternMatcher matcher;

    compiler = new Perl5Compiler();
    matcher = new Perl5Matcher();

    try {
      pattern = compiler.compile(pat, 
				 Perl5Compiler.CASE_INSENSITIVE_MASK);
    } catch(MalformedPatternException e) {
      System.err.println("Bad pattern.");
      System.err.println(e.getMessage());
      return false;
    }
    boolean retVal = matcher.matches(word, pattern);
    //    if (retVal) 
    //      Log.println("Match found for '" + pat + "' in '" + word + "'");
    return retVal;
  }

  private static String substitute(String word, String pat, String subst, 
				   PrintWriter Log) {
    // Begin the ORO Matcher stuff...
    // ACK!!  It's the STL, no its ORO...sometimes this truly becomes
    // insane....
    
    Pattern pattern;
    Perl5Compiler compiler;
    PatternMatcher matcher;
    String result;
    int sub = Util.SUBSTITUTE_ALL;
    int interp = Perl5Substitution.INTERPOLATE_ALL;

    compiler = new Perl5Compiler();
    matcher = new Perl5Matcher();

    try {
      pattern = compiler.compile(pat, 
				 Perl5Compiler.CASE_INSENSITIVE_MASK);
    } catch(MalformedPatternException e) {
      System.err.println("Bad pattern.");
      System.err.println(e.getMessage());
      return word;
    }  
    String retVal =  Util.substitute(matcher, pattern, 
			             new Perl5Substitution(subst, interp),
			             word, sub);
    //    if (retVal != word) 
    //      Log.println("Substituted '" + subst + "' for '" + pat + "' in '" + 
    //		  word + "'");
    return retVal;
  }
}
