Pages

Removing stop words from text using Java

What are stop words?

Stop words are words that occur very frequently in a language and do not help determine the relevance of a document to a user query. Removing them also helps keep the dictionary (vocabulary) size small.

Examples of stop words: is, am, the, where, your, you.

Here I am using a standard stop list to remove these words from our text.
package com.esc.xyz;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;

/**
 * Removes common English stop words (for example "is", "the", "where") from
 * a list of words.
 *
 * <p>Matching is case-insensitive: a word is dropped when its
 * {@code Locale.ROOT} lower-case form appears in {@link #stopWords}. Words
 * that are kept retain their original spelling, and the result is a set, so
 * repeated words appear once.
 *
 * @author xyz
 * @version 1.0.0
 */
public class StopWordRemoval {

    /** Stop list; every entry is lower case and is compared against lower-cased input. */
    String[] stopWords = { "a", "about", "above", "across", "after", "again",
            "against", "all", "almost", "alone", "along", "already", "also",
            "although", "always", "among", "an", "and", "another", "any",
            "anybody", "anyone", "anything", "anywhere", "are", "area",
            "areas", "around", "as", "ask", "asked", "asking", "asks", "at",
            "away", "b", "back", "backed", "backing", "backs", "be", "became",
            "because", "become", "becomes", "been", "before", "began",
            "behind", "being", "beings", "best", "better", "between", "big",
            "both", "but", "by", "c", "came", "can", "cannot", "case", "cases",
            "certain", "certainly", "clear", "clearly", "come", "could", "d",
            "did", "differ", "different", "differently", "do", "does", "done",
            "down", "downed", "downing", "downs", "during", "e",
            "each", "early", "either", "end", "ended", "ending", "ends",
            "enough", "even", "evenly", "ever", "every", "everybody",
            "everyone", "everything", "everywhere", "f", "face", "faces",
            "fact", "facts", "far", "felt", "few", "find", "finds", "first",
            "for", "four", "from", "full", "fully", "further", "furthered",
            "furthering", "furthers", "g", "gave", "general", "generally",
            "get", "gets", "give", "given", "gives", "go", "going", "good",
            "goods", "got", "great", "greater", "greatest", "group", "grouped",
            "grouping", "groups", "h", "had", "has", "have", "having", "he",
            "her", "here", "herself", "high", "higher",
            "highest", "him", "himself", "his", "how", "however", "i", "if",
            "important", "in", "interest", "interested", "interesting",
            "interests", "into", "is", "it", "its", "itself", "j", "just", "k",
            "keep", "keeps", "kind", "knew", "know", "known", "knows", "l",
            "large", "largely", "last", "later", "latest", "least", "less",
            "let", "lets", "like", "likely", "long", "longer", "longest", "m",
            "made", "make", "making", "man", "many", "may", "me", "member",
            "members", "men", "might", "more", "most", "mostly", "mr", "mrs",
            "much", "must", "my", "myself", "n", "necessary", "need", "needed",
            "needing", "needs", "never", "new", "newer", "newest",
            "next", "no", "nobody", "non", "noone", "not", "nothing", "now",
            "nowhere", "number", "numbers", "o", "of", "off", "often", "old",
            "older", "oldest", "on", "once", "one", "only", "open", "opened",
            "opening", "opens", "or", "order", "ordered", "ordering", "orders",
            "other", "others", "our", "out", "over", "p", "part", "parted",
            "parting", "parts", "per", "perhaps", "place", "places", "point",
            "pointed", "pointing", "points", "possible", "present",
            "presented", "presenting", "presents", "problem", "problems",
            "put", "puts", "q", "quite", "r", "rather", "really", "right",
            "room", "rooms", "s", "said", "same", "saw", "say",
            "says", "second", "seconds", "see", "seem", "seemed", "seeming",
            "seems", "sees", "several", "shall", "she", "should", "show",
            "showed", "showing", "shows", "side", "sides", "since", "small",
            "smaller", "smallest", "so", "some", "somebody", "someone",
            "something", "somewhere", "state", "states", "still",
            "such", "sure", "t", "take", "taken", "than", "that", "the",
            "their", "them", "then", "there", "therefore", "these", "they",
            "thing", "things", "think", "thinks", "this", "those", "though",
            "thought", "thoughts", "three", "through", "thus", "to", "today",
            "together", "too", "took", "toward", "turn", "turned", "turning",
            "turns", "two", "u", "under", "until", "up", "upon", "us", "use",
            "used", "uses", "v", "very", "w", "want", "wanted", "wanting",
            "wants", "was", "way", "ways", "we", "well", "wells", "went",
            "were", "what", "when", "where", "whether", "which", "while",
            "who", "whole", "whose", "why", "will", "with", "within",
            "without", "work", "worked", "working", "works", "would", "x", "y",
            "year", "years", "yet", "you", "young", "younger", "youngest",
            "your", "yours", "z" };

    /** Sample input consumed by {@link #removeStopWord()}. */
    String[] words = { "Ram", "is", "a", "good", "boy", "boy", "the", "where",
            "your", "yours", "girl", "girl", "ram", "the", "at", "at", "on" };

    /**
     * Prints the distinct entries of {@link #words} that are not stop words
     * to standard output, using the set's {@code toString()} format.
     */
    public void removeStopWord() {
        System.out.println(removeStopWords(words));
    }

    /**
     * Returns the distinct entries of {@code input} that are not stop words.
     *
     * <p>A word counts as a stop word when its lower-case form (computed with
     * {@code Locale.ROOT}, so the result does not depend on the default
     * locale) is contained in {@link #stopWords}. Surviving words keep their
     * original case, so "Ram" and "ram" are both retained as separate entries.
     * The returned set is a {@code HashSet}; its iteration order is not
     * guaranteed.
     *
     * @param input the words to filter; {@code null} is treated as empty and
     *              {@code null} elements are dropped
     * @return a new, modifiable set holding the non-stop words of {@code input}
     */
    public Set<String> removeStopWords(String[] input) {
        if (input == null) {
            return new HashSet<>();
        }

        Set<String> stopWordSet = new HashSet<>(Arrays.asList(stopWords));
        Set<String> remaining = new HashSet<>(Arrays.asList(input));

        // Explicit iterator so entries can be removed safely while traversing.
        for (Iterator<String> it = remaining.iterator(); it.hasNext();) {
            String word = it.next();
            if (word == null
                    || stopWordSet.contains(word.toLowerCase(Locale.ROOT))) {
                it.remove();
            }
        }
        return remaining;
    }

    /**
     * Runs the demo: filters the built-in sample words and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        StopWordRemoval stpRemove = new StopWordRemoval();
        stpRemove.removeStopWord();
    }
}

OUTPUT

without stop words = [Ram, girl, ram, boy]   (element order may vary: a HashSet does not guarantee iteration order)

Getting unique words and word frequencies for a given array in Java

 
package com.esc.xyz;

/**
 * This class provides basic utility functions for words.
 * 
 * @author xyz
 * @version 1.0.0
 * 
 * 
 *
 */

import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;


/**
 * Word-level helpers: array size, distinct words and per-word occurrence
 * counts. Words are compared case-insensitively by lower-casing them with
 * {@code Locale.ROOT}, so results do not depend on the JVM's default locale.
 */
public class WordUtil {

    /**
     * Returns the number of entries in the given array.
     *
     * @param words the array to measure; may be {@code null}
     * @return the array length, or {@code 0} when {@code words} is {@code null}
     */
    public int getSize(String[] words) {
        return (words == null) ? 0 : words.length;
    }

    /**
     * Collects the distinct words of the array, ignoring case.
     *
     * <p>Each word is lower-cased before being added, so "Ram" and "ram"
     * yield the single entry "ram". The resulting set is also printed to
     * standard output.
     *
     * @param words the words to de-duplicate; {@code null} is treated as
     *              empty and {@code null} elements are skipped
     * @return a new set of the lower-cased distinct words
     */
    public Set<String> getUniqueWords(String[] words) {

        Set<String> uniqueWords = new HashSet<String>();
        if (words != null) {
            for (String word : words) {
                if (word != null) {
                    uniqueWords.add(normalize(word));
                }
            }
        }

        System.out.println(uniqueWords);
        return uniqueWords;
    }

    /**
     * Counts how often each word occurs in the array, ignoring case.
     *
     * <p>Keys of the returned map are the lower-cased words; values are the
     * number of occurrences. The map is also printed to standard output.
     *
     * @param words the words to count; {@code null} is treated as empty and
     *              {@code null} elements are skipped
     * @return a new map from lower-cased word to its occurrence count
     */
    public Map<String, Integer> getFrequency(String[] words) {

        Map<String, Integer> wordsFrequencies = new HashMap<>();
        if (words != null) {
            for (String word : words) {
                if (word == null) {
                    continue;
                }
                // Normalise once and reuse the key for both lookup and store.
                String key = normalize(word);
                Integer count = wordsFrequencies.get(key);
                wordsFrequencies.put(key, (count == null) ? 1 : count + 1);
            }
        }

        System.out.println(wordsFrequencies);
        return wordsFrequencies;
    }

    /**
     * Misspelled predecessor of {@link #getFrequency(String[])}, retained so
     * existing callers keep compiling.
     *
     * @param words the words to count
     * @return the same result as {@link #getFrequency(String[])}
     * @deprecated use {@link #getFrequency(String[])}
     */
    @Deprecated
    public Map<String, Integer> getFreuency(String[] words) {
        return getFrequency(words);
    }

    /** Lower-cases a word independently of the default locale. */
    private static String normalize(String word) {
        return word.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Runs the demo: prints the unique words and the word frequencies of a
     * small sample array.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        String[] words = { "Ram", "is", "a", "Ram", "good", "boy", "boy",
                "girl", "girl","ram","the","at","at","on"};
        WordUtil wordFrequency = new WordUtil();
        wordFrequency.getUniqueWords(words);
        wordFrequency.getFrequency(words);
    }

}