termco icon indicating copy to clipboard operation
termco copied to clipboard

Give important and frequent terms a grouping.var argument

Open trinker opened this issue 8 years ago • 2 comments

n of each group

trinker avatar Feb 27 '17 20:02 trinker

will affect the plot methods

trinker avatar May 18 '17 20:05 trinker

frequent_terms <- function(text.var, n = 20, grouping.var = NULL, 
	stopwords = stopwords::stopwords("english"), min.freq = NULL, min.char = 4, 
	max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
    strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {
	
	
    if (is.data.frame(text.var)) stop("`text.var` is a `data.frame`; please pass a vector")

    text.var <- stringi::stri_trans_tolower(text.var)

    ## remove nonascii characters
    text.var <- iconv(text.var, "latin1", "ASCII", sub = "")

    ## regex strip of non-word/space characters
    if (isTRUE(strip)) text.var <- gsub(strip.regex, " ", text.var)
    
    if(is.null(grouping.var)) {
        G <- "all"
    } else {
        if (is.list(grouping.var)) {
            m <- unlist(as.character(substitute(grouping.var))[-1])
            G <- sapply(strsplit(m, "$", fixed=TRUE), function(x) {
                x[length(x)]
            })
        } else {
            G <- as.character(substitute(grouping.var))
            G <- G[length(G)]
        }
    }
    
    if(is.null(grouping.var)){
        grouping <- rep("all", length(text.var))
    } else {
        if (isTRUE(grouping.var)) {
            grouping <- seq_along(text.var)
        } else {
            if (is.list(grouping.var) & length(grouping.var)>1) {
                grouping <- grouping.var
            } else {
                grouping <- unlist(grouping.var)
            }
        }
    }

    if(!missing(group.names)) {
        G <- group.names
    }

    DF <- data.frame(text.var, check.names = FALSE, stringsAsFactors = FALSE)
    DF[G] <- grouping
    
    DF <- data.table::data.table(DF)
    
    DF <- DF[, list(text.var = paste(text.var, collapse = ' ')), by = G]
    
    grp <- DF[, G, with = FALSE]
    
    outs <- lapply(seq_len(nrow(DF)),  function(i){
    	
    	cnts <- frequent_terms_helper(DF[['text.var']][i], n = n, stopwords = stopwords, 
    		min.freq = min.freq, min.char = min.char, max.char = max.char, 
    		stem = stem, language = language, strip = strip, 
    		strip.regex = strip.regex, alphabetical = alphabetical
    	)
# browser()
    	
    	out <- as.data.frame(
    		dplyr::bind_cols(grp[rep(i, nrow(cnts)), ], cnts),
    		check.names = FALSE,
    		stringsAsFactors = FALSE
    	)

    	n.words <- attributes(out)[["n.words"]]
    	
	    if (isTRUE(alphabetical)){
	        out <- out[order(out[["term"]]), ]
	    }
	
	    if (n < 1) {
	        n <- round(n * nrow(out), 0)
	    }
	
	    if (n > nrow(out)) {
	        n <- nrow(out)
	    }
	
	    if (is.null(min.freq)) {
	        out2 <- out[out[["frequency"]] >= out[["frequency"]][n], ]
	    } else {
	        out2 <- out[out[["frequency"]] >= min.freq, ]
	        n <- nrow(out2)
	    }
	
	    class(out2) <- c('frequent_terms', class(out))
	    attributes(out2)[["n"]] <- n
	    attributes(out2)[["full"]] <- out
	    attributes(out2)[["n.words"]] <- n.words
	    attributes(out2)[["group.var"]] <- G
	    out2    	
    	
    })
    
}

text.var <- termco::presidential_debates_2012$dialogue
alphabetical <- FALSE
grouping.var <- termco::presidential_debates_2012$person
language <- "porter"
max.char <- Inf
min.char <- 4
min.freq <- NULL
n <- 20
stem <- FALSE
stopwords <- stopwords::stopwords("english")
strip <- TRUE
strip.regex <- "[^a-z' ]"
 


frequent_terms_helper <- function(text.var, n = 20, 
	stopwords = stopwords::stopwords("english"), min.freq = NULL, min.char = 4, 
	max.char = Inf, stem = FALSE, language = "porter", strip = TRUE,
    strip.regex = "[^a-z' ]", alphabetical = FALSE, ...) {

    y <- unlist(stringi::stri_extract_all_words(text.var))
    n.words <- sum(stringi::stri_count_words(text.var), na.rm = TRUE)

    ## stemming
    if (isTRUE(stem)) {
        y <- SnowballC::wordStem(y, language = language)
        if (! is.null(stopwords)) stopwords <- SnowballC::wordStem(stopwords, language = language)
    }

    ## exclude less than the min character cut-off
    y <- y[nchar(y) > min.char - 1]

    ## exclude more than the max character cut-off
    y <- y[nchar(y) < max.char + 1]

    ## data frame of counts
    y <- sort(table(y), TRUE)

    ## stopword removal
    if (!is.null(stopwords)){
        y <- y[!names(y) %in% stopwords]
    }

    out <- data.frame(term = names(y), frequency = c(unlist(y, use.names=FALSE)),
        stringsAsFactors = FALSE, row.names=NULL)

    attributes(out)[["n.words"]] <- n.words
    
    out
}



trinker avatar Feb 17 '18 17:02 trinker