if (interactive()) {
  # Initialize the connection to the Dallas database in Aster.
  # The connection string is assembled with paste0() so that no line break
  # ends up embedded between its key=value pairs.
  conn <- odbcDriverConnect(
    connection = paste0(
      "driver={Aster ODBC Driver};",
      "server=<dbhost>;port=2406;database=<dbname>;uid=<user>;pwd=<pw>"
    )
  )

  # Compute the term-document matrix of all 2-word ngrams of Dallas police
  # crime reports for each 4-digit zip.
  tdm1 <- computeTfIdf(
    channel = conn,
    tableName = "public.dallaspoliceall",
    docId = "substr(offensezip, 1, 4)",
    textColumns = c("offensedescription", "offensenarrative"),
    parser = nGram(2, ignoreCase = TRUE, punctuation = "[-.,?\\!:;~()]+")
  )

  # Compute the term-document matrix of all 2-word combinations of Dallas
  # police crime reports for each type of offense status.
  # Every call uses the open connection `conn`, not a NULL channel.
  tdm2 <- computeTfIdf(
    channel = conn,
    tableName = "public.dallaspoliceall",
    docId = "offensestatus",
    textColumns = c("offensedescription", "offensenarrative", "offenseweather"),
    parser = token(2),
    where = "offensestatus NOT IN ('System.Xml.XmlElement', 'C')"
  )

  # Include only the top 100 ranked 2-word ngrams for each 4-digit zip in the
  # resulting term-document matrix, using the rank function.
  tdm3 <- computeTfIdf(
    channel = conn,
    tableName = "public.dallaspoliceall",
    docId = "substr(offensezip, 1, 4)",
    textColumns = c("offensedescription", "offensenarrative"),
    parser = nGram(2),
    top = 100
  )

  # Same, but keep the top 10% ranked terms using the percent rank function.
  tdm4 <- computeTfIdf(
    channel = conn,
    tableName = "public.dallaspoliceall",
    docId = "substr(offensezip, 1, 4)",
    textColumns = c("offensedescription", "offensenarrative"),
    parser = nGram(1),
    top = 0.10,
    rankFunction = "percentrank"
  )

  # Release the ODBC handle once all matrices have been computed.
  odbcClose(conn)
}
# Run the code above in your browser using DataLab