- Preprocess the dataset, extract information, and discover patterns from a disaster dataset.
Milestone 1: Import a JSON file from the web, change the column order, and save it as the CSV file "DisasterEvent.csv".
----------------------------------------------------- RStudio -------------------------------------------------------
library(RJSONIO)
# Fetch the raw disaster-event records as JSON from the course website.
DisasterEventRaw <- fromJSON("http://ist.gmu.edu/~hpurohit/courses/ait690-proj-data-spring17.json")
# Alternative: read the same JSON from a local copy of the file.
#DisasterEventRaw <- fromJSON("ait690_proj_data_spring17.json")
# Number of records downloaded.
length(DisasterEventRaw)
# Coerce the list of records into a rectangular structure (one row per event).
final_data <- do.call(rbind, DisasterEventRaw)
# Keep only the columns of interest, in the desired output order.
# (The original code subset the columns twice in a row; the first subset
# was immediately overwritten, so a single subset is equivalent.)
final_data <- final_data[, c("DOCUMENT_ID", "MESSAGE", "DATETIME", "LATITUDE", "LONGITUDE")]
# Write the reordered data to a flat CSV file for the next milestone.
write.csv(final_data, "DisasterEvent.csv")
-----------------------------------------------------------------------------------------------------------------------
Milestone 2
2.1 Separate Date and Time
----------------------------------------------------- RStudio -------------------------------------------------------
# Split a column from the csv file
# Split the DATETIME column of the Milestone 1 export into Date and Time.
library(tidyr)
# Load the dataset produced in Milestone 1.
events <- read.csv("DisasterEvent.csv")
View(events)
# Copy DATETIME so the original column survives the split.
DATETIME2 <- events$DATETIME
# Append the copy as an extra column at the end.
events_with_copy <- cbind(events, DATETIME2)
View(events_with_copy)
# Break the copied column into separate Date and Time columns on the space.
events_split <- separate(events_with_copy, DATETIME2, c("Date", "Time"), sep = " ")
View(events_split)
# Persist the result for the next preprocessing step.
write.csv(events_split, "DisasterEvent_Split.csv")
-----------------------------------------------------------------------------------------------------------------------
2.2 Data preprocessing
Then, this project uses the NLP and tm
packages and imports the csv file “DisasterEvent_Split.csv” to remove stop words, URLs, punctuation, and numbers.
It also converts the text to lowercase. The final step saves the result as the CSV file “DisasterEvent_RemoveWords.csv”.
----------------------------------------------------- RStudio -------------------------------------------------------
# Text Mining: clean the MESSAGE column for downstream clustering.
library(NLP)
library(tm)
library(SnowballC)
# Read the split dataset; keep MESSAGE as character, not factor.
myData <- read.csv("DisasterEvent_Split.csv", stringsAsFactors = FALSE)
View(myData)
myData$MESSAGE
# Strip mis-encoded single-byte characters left over from the source
# encoding (the original repeated one gsub() per character).
junk_chars <- c("\xa1", "\xfd", "\x93", "\x81", "\xf3", "\xfc", "\x9c", "¡")
for (ch in junk_chars) {
  myData$MESSAGE <- gsub(ch, "", myData$MESSAGE, fixed = TRUE)
}
# Remove URLs. The original first pattern 'http.* *' greedily deleted
# everything after the first "http" in a message; 'http\\S+\\s*' removes
# only the URL token itself (plus trailing whitespace), which is what the
# two follow-up patterns were attempting.
myData$MESSAGE <- gsub("http\\S+\\s*", "", myData$MESSAGE)
# Remove punctuation. Note this already strips <, >, /, @ and |, so the
# original's individual gsub() calls for those characters were no-ops
# and have been dropped.
myData$MESSAGE <- removePunctuation(myData$MESSAGE)
# Normalise case.
myData$MESSAGE <- tolower(myData$MESSAGE)
# Remove numbers.
myData$MESSAGE <- tm::removeNumbers(myData$MESSAGE)
# Remove English stop words. removeWords() operates directly on a
# character vector; the original used tm_map(), which requires a corpus
# and errors when given a plain character vector.
myData$MESSAGE <- removeWords(myData$MESSAGE, stopwords("english"))
View(myData)
# Save the cleaned dataset.
write.csv(myData, "DisasterEvent_RemoveWords.csv")
-----------------------------------------------------------------------------------------------------------------------
Milestone 3: Mining Tool Preparation
3.1
This step imports csv file and uses foreign package to change and save arff
file “DisasterEvent.arff”.
----------------------------------------------------- RStudio -------------------------------------------------------
# Step 3.1: convert the cleaned CSV into WEKA's ARFF format.
library(foreign)
# Load the preprocessed dataset from Milestone 2.2.
cleaned <- read.csv("DisasterEvent_RemoveWords.csv")
# Export it as an ARFF file for use in WEKA.
write.arff(cleaned, "DisasterEvent.arff")
-----------------------------------------------------------------------------------------------------------------------
3.2
This step opens the file “DisasterEvent.arff” and preprocesses it to remove attributes
that are not meaningful.
3.3
Apply NominalToString and StringToWordVector to transform the MESSAGE
string.
Milestone 4: Clustering Analysis
4.1
Apply a clustering algorithm with three clusters.
4.2
Adjust x-lab as Instance_number (NUM), Y-lab as Cluster (Nom), and Color as
Cluster (Nom) for Visualization of clusters in WEKA.
4.3 Result: This step clicks the button “Start”
to run the clustering. The following picture is captured from WEKA. The description
is the content of “Run information”. Then, the result is stored as the CSV file
“WEKA.csv” to provide data for analytics in the next milestone.
Description:
=== Run
information ===
Scheme: weka.clusterers.SimpleKMeans -init 0
-max-candidates 100 -periodic-pruning 10000 -min-density 2.0 -t1 -1.25 -t2 -1.0
-N 3 -A "weka.core.EuclideanDistance -R first-last" -I 500 -num-slots
1 -S 10
Relation:
myData-weka.filters.unsupervised.attribute.Remove-R1-4,6-10-weka.filters.unsupervised.attribute.NominalToString-Clast-weka.filters.unsupervised.attribute.StringToWordVector-R1-W1000-prune-rate-1.0-N0-stemmerweka.core.stemmers.NullStemmer-stopwords-handlerweka.core.stopwords.Null-M1-tokenizerweka.core.tokenizers.WordTokenizer
-delimiters " \r\n\t.,;:\'\"()?!"
Instances: 3135
Attributes: 1040
[list of attributes omitted]
Test
mode: evaluate on training data
===
Clustering model (full training set) ===
kMeans
======
Number of
iterations: 12
Within
cluster sum of squared errors: 28104.238524634548
Initial
starting points (random):
Cluster 0:
{248 1,337 1,367 1,439 1,482 1,488 1,559 1,573 1,619 1,782 1,856 1,887 1,913 1}
Cluster 1:
{0 1,231 1,248 1,439 1,501 1,529 1,672 1,721 1,734 1,757 1,774 1,827 1,887
1,1016 1}
Cluster 2:
{29 1,131 1,210 1,248 1,325 1,439 1,446 1,529 1,544 1,729 1,734 1,774 1,866
1,887 1,913 1,1033 1}
Missing
values globally replaced with mean/mode
Time taken
to build model (full training data) : 2.08 seconds
=== Model and
evaluation on training set ===
Clustered
Instances
0 1308 ( 42%)
1 1376 ( 44%)
2 451 ( 14%)
Milestone 5:
Process of
milestone 5: Milestone 5 uses the CSV
file “WEKA.csv”. This step deletes the description and the columns 0 and 1. Then milestone 5
filters the three clusters and saves them as three csv files: cluster0, cluster1, and
cluster2. Then, this project uses RStudio to create word clouds to interpret each
cluster.
The DIKW framework
of this project: From the AIT690
website, this project downloads data
which contains many attributes: ORDER, DOCUMENT_ID,
MESSAGE, DATETIME, LATITUDE, and LONGITUDE. After preprocessing the data, new
tables can show understanding relations. Then, this project transfers new
tables to arff file and put into WEKA. By WEKA, the dataset is clustered, and
WEKA shows three clusters’ report and result to form information. From information, this project puts clusters into
RStudio to create word clouds. The word clouds show different words from big to
small by ranking and format a pattern. From word clouds, three words which are
help, victims, donate occur in three clusters top ten most frequent word. Three
word clouds create understanding patterns about Red Cross donation, donating
food, and many efforts to help victims and millions of people in disasters. These
understanding patterns format knowledge.
Knowledge is structured by patterns. The patterns come from word clouds. The
word clouds provide sequence understanding to give human knowledge to help and
donate victims and people who need help in disasters. When people understand
these principles, human can follow the concepts to form wisdom. In the future, wisdom could provide future works to support
and format understanding principles to sapience. By sapience, people can
realize to help and donate victims when disasters happen in the future.
Data: dataset from AIT690 website. The attributes include ORDER, DOCUMENT_ID, MESSAGE, DATETIME, LATITUDE, and LONGITUDE.
Information: This project uses WEKA to create three clusters and
then uses RStudio to import the three clusters and form three word clouds. These
three word clouds explain each cluster’s details.
Knowledge: The three word clouds’ details are explained for humans.
Then, humans get information and understand the patterns. Finally, they
form knowledge.
This process is to delete redundant cells.
Cluster 0
From cluster0, this project uses RStudio to create a word cloud that includes words occurring more than 15 times. Then, the next step calculates the top 10 frequent words and creates a word cloud to show them.
----------------------------------------------------- RStudio -------------------------------------------------------
# Milestone 5 - Cluster0: word cloud and top-10 frequency chart.
library(NLP)
library(tm)
library(SnowballC)
library(RColorBrewer)
library(wordcloud)
# Load the cluster-0 messages exported from WEKA.
cluster0_df <- read.csv("Cluster0.csv")
View(cluster0_df)
# Collapse every message into one long string.
cluster0_text <- toString(cluster0_df$MESSAGE)
cluster0_text
# Wrap the text in a single-document corpus.
corpus0 <- Corpus(VectorSource(cluster0_text))
corpus0
# Drop English stop words.
corpus0 <- tm_map(corpus0, removeWords, stopwords('english'))
# Drop domain words that dominate every cluster.
corpus0 <- tm_map(corpus0, removeWords, c("can","hurricane","sandy","sandyhelp"))
# Strip punctuation.
corpus0 <- tm_map(corpus0, removePunctuation)
corpus0
# Build a term-document matrix and rank terms by total frequency.
tdm0 <- TermDocumentMatrix(corpus0)
term_freqs <- sort(rowSums(as.matrix(tdm0)), decreasing = TRUE)
d <- data.frame(word = names(term_freqs), freq = term_freqs)
head(d, 10)
# Word cloud of all terms appearing at least 15 times.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 15,
          max.words = 450, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Bar chart of the ten most frequent terms.
barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")
# Word cloud restricted to the top ten terms.
set.seed(1234)
wordcloud(words = d[1:10, ]$word, freq = d[1:10, ]$freq, min.freq = 15,
          max.words = 450, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
-----------------------------------------------------------------------------------------------------------------------
Cluster 1
From this cluster, this project uses RStudio to
create a word cloud that includes words occurring more than 15 times. Then, the next step
calculates the top 10 frequent words and creates a word cloud to show them.
----------------------------------------------------- RStudio -------------------------------------------------------
# Milestone 5 - Cluster1: word cloud and top-10 frequency chart.
library(NLP)
library(tm)
library(SnowballC)
library(RColorBrewer)
library(wordcloud)
# Load the cluster-1 messages exported from WEKA.
cluster1_df <- read.csv("Cluster1.csv")
View(cluster1_df)
# Collapse every message into one long string.
cluster1_text <- toString(cluster1_df$MESSAGE)
cluster1_text
# Wrap the text in a single-document corpus.
corpus1 <- Corpus(VectorSource(cluster1_text))
corpus1
# Drop English stop words.
corpus1 <- tm_map(corpus1, removeWords, stopwords('english'))
# Drop domain words that dominate every cluster.
corpus1 <- tm_map(corpus1, removeWords, c("can","hurricane","sandy","sandyhelp"))
# Strip punctuation.
corpus1 <- tm_map(corpus1, removePunctuation)
corpus1
# Build a term-document matrix and rank terms by total frequency.
tdm1 <- TermDocumentMatrix(corpus1)
term_freqs <- sort(rowSums(as.matrix(tdm1)), decreasing = TRUE)
d <- data.frame(word = names(term_freqs), freq = term_freqs)
head(d, 10)
# Word cloud of all terms appearing at least 15 times.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 15,
          max.words = 450, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Bar chart of the ten most frequent terms.
barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")
# Word cloud restricted to the top ten terms.
set.seed(1234)
wordcloud(words = d[1:10, ]$word, freq = d[1:10, ]$freq, min.freq = 15,
          max.words = 470, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
-----------------------------------------------------------------------------------------------------------------------
Cluster 2
From cluster2, this project uses RStudio to create a
word cloud that includes words occurring more than 15 times. Then, the next step
calculates the top 10 frequent words and creates a word cloud to show them.
----------------------------------------------------- RStudio -------------------------------------------------------
# Milestone 5 - Cluster2: word cloud and top-10 frequency chart.
library(NLP)
library(tm)
library(SnowballC)
library(RColorBrewer)
library(wordcloud)
# Load the cluster-2 messages exported from WEKA.
cluster2_df <- read.csv("Cluster2.csv")
View(cluster2_df)
# Collapse every message into one long string.
cluster2_text <- toString(cluster2_df$MESSAGE)
cluster2_text
# Wrap the text in a single-document corpus.
corpus2 <- Corpus(VectorSource(cluster2_text))
corpus2
# Drop English stop words.
corpus2 <- tm_map(corpus2, removeWords, stopwords('english'))
# Drop domain words that dominate every cluster.
corpus2 <- tm_map(corpus2, removeWords, c("can","hurricane","sandy","sandyhelp"))
# Strip punctuation.
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2
# Build a term-document matrix and rank terms by total frequency.
tdm2 <- TermDocumentMatrix(corpus2)
term_freqs <- sort(rowSums(as.matrix(tdm2)), decreasing = TRUE)
d <- data.frame(word = names(term_freqs), freq = term_freqs)
head(d, 10)
# Word cloud of all terms appearing at least 15 times.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 15,
          max.words = 450, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Bar chart of the ten most frequent terms.
barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")
# Word cloud restricted to the top ten terms.
set.seed(1234)
wordcloud(words = d[1:10, ]$word, freq = d[1:10, ]$freq, min.freq = 15,
          max.words = 470, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
-----------------------------------------------------------------------------------------------------------------------
Final - Topic Modeling
This step uses RStudio load three csv files
“cluster0.csv”, “cluster1.csv”, and “cluster2.csv” and six packages: NLP, tm, RColorBrewer, wordcloud, topicmodels,
and SnowballC. Then, this step loads each file as a corpus. Then, this step includes
creating word clouds to test corpus. Final step uses each cluster file to
create topic modeling.
Cluster0
# Topic modeling for cluster 0.
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
# Read the cluster-0 messages and collapse them into one string.
cluster0 <- read.csv("Cluster0.csv")
cluster0 <- toString(cluster0$MESSAGE)
cluster0
# Load cluster0 as a single-document corpus.
cluster0Corpus <- Corpus(VectorSource(cluster0))
cluster0Corpus
# Remove English stop words.
cluster0Corpus <- tm_map(cluster0Corpus, removeWords, stopwords('english'))
# Remove domain words that dominate every cluster.
cluster0Corpus <- tm_map(cluster0Corpus, removeWords, c("can","hurricane","sandy","sandyhelp"))
# Remove punctuation.
cluster0Corpus <- tm_map(cluster0Corpus, removePunctuation)
cluster0Corpus
# Sanity-check word cloud of the cleaned corpus.
pal <- brewer.pal(8, "Dark2")
wordcloud(cluster0Corpus, min.freq = 15, max.words = 150, random.order = TRUE, col = pal)
# Topic modeling: drop any empty documents before building the DTM.
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(cluster0Corpus)))
dtm <- DocumentTermMatrix(cluster0Corpus[doc.lengths > 0])
# (The original fitted an extra LDA(dtm, 4) model here whose result was
# never used; it has been removed.)
SEED <- sample(1:1000000, 1)
k <- 4
# Fit four topic models with the same number of topics for comparison.
models <- list(
  CTM = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
  VEM = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000,
                                                           thin = 100, iter = 1000))
)
# Top 4 terms per topic for each model.
lapply(models, terms, 4)
# Topic assignment of each document under each model.
assignments <- sapply(models, topics)
assignments
----------------------------------------------------------------------------------------------------------------------
Cluster1
# Topic modeling for cluster 1.
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
# Read the cluster-1 messages and collapse them into one string.
cluster1 <- read.csv("Cluster1.csv")
cluster1 <- toString(cluster1$MESSAGE)
cluster1
# Load cluster1 as a single-document corpus.
cluster1Corpus <- Corpus(VectorSource(cluster1))
cluster1Corpus
# Remove English stop words.
cluster1Corpus <- tm_map(cluster1Corpus, removeWords, stopwords('english'))
# Remove domain words that dominate every cluster.
cluster1Corpus <- tm_map(cluster1Corpus, removeWords, c("can","hurricane","sandy","sandyhelp"))
# Remove punctuation.
cluster1Corpus <- tm_map(cluster1Corpus, removePunctuation)
cluster1Corpus
# Sanity-check word cloud of the cleaned corpus.
pal <- brewer.pal(8, "Dark2")
wordcloud(cluster1Corpus, min.freq = 15, max.words = 150, random.order = TRUE, col = pal)
# Cluster1 topic modeling (original comment said "cluster0" by mistake):
# drop any empty documents before building the DTM.
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(cluster1Corpus)))
dtm <- DocumentTermMatrix(cluster1Corpus[doc.lengths > 0])
# (The original fitted an extra LDA(dtm, 4) model here whose result was
# never used; it has been removed.)
SEED <- sample(1:1000000, 1)
k <- 4
# Fit four topic models with the same number of topics for comparison.
models <- list(
  CTM = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
  VEM = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000,
                                                           thin = 100, iter = 1000))
)
# Top 4 terms per topic for each model.
lapply(models, terms, 4)
# Topic assignment of each document under each model.
assignments <- sapply(models, topics)
assignments
----------------------------------------------------------------------------------------------------------------------
Cluster2
# Topic modeling for cluster 2.
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
# Read the cluster-2 messages and collapse them into one string.
cluster2 <- read.csv("Cluster2.csv")
cluster2 <- toString(cluster2$MESSAGE)
cluster2
# Load cluster2 as a single-document corpus (original comment said
# "Cluster0" by mistake).
cluster2Corpus <- Corpus(VectorSource(cluster2))
cluster2Corpus
# Remove English stop words.
cluster2Corpus <- tm_map(cluster2Corpus, removeWords, stopwords('english'))
# Remove domain words that dominate every cluster.
cluster2Corpus <- tm_map(cluster2Corpus, removeWords, c("can","hurricane","sandy","sandyhelp"))
# Remove punctuation.
cluster2Corpus <- tm_map(cluster2Corpus, removePunctuation)
cluster2Corpus
# Sanity-check word cloud of the cleaned corpus.
pal <- brewer.pal(8, "Dark2")
wordcloud(cluster2Corpus, min.freq = 15, max.words = 150, random.order = TRUE, col = pal)
# Cluster2 topic modeling: drop any empty documents before the DTM.
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(cluster2Corpus)))
dtm <- DocumentTermMatrix(cluster2Corpus[doc.lengths > 0])
# (The original fitted an extra LDA(dtm, 4) model here whose result was
# never used; it has been removed.)
SEED <- sample(1:1000000, 1)
k <- 4
# Fit four topic models with the same number of topics for comparison.
models <- list(
  CTM = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
  VEM = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000,
                                                           thin = 100, iter = 1000))
)
# Top 4 terms per topic for each model.
lapply(models, terms, 4)
# Topic assignment of each document under each model.
assignments <- sapply(models, topics)
assignments
------------------------------------------------------------------------------------------------------------------
沒有留言:
張貼留言