# Individual Exercise Solution
library(readtext) # easily read in text in directories
library(lubridate) # year function
library(quanteda) # corpus creation
library(stm) # structural topic models
# Fetch the two Europarl archives (speech texts and metadata) into the
# working directory, then unpack both of them in place.
archive_names <- c('europarl-data-speeches.zip', 'europarl-metadata.zip')
for (archive in archive_names) {
  download.file(paste0('http://erdos.ucd.ie/files/europarl/', archive),
                archive)
}
for (archive in archive_names) {
  unzip(archive)
}
# Collect, recursively, the full paths of every speech file stored under the
# 2009 and 2010 directories of the extracted archive.
speech_dirs <- file.path('europarl-data-speeches', c('2009', '2010'))
speeches_paths <- list.files(path = speech_dirs,
                             recursive = TRUE,
                             full.names = TRUE)
# Read the speech files and turn them into a quanteda corpus in one step.
speeches <- corpus(readtext(speeches_paths))
# Tag every document with a document-level 'type' field.
# NOTE(review): `metadoc<-` may not be available in newer quanteda releases -
# confirm against the installed version.
metadoc(speeches, field = 'type') <- 'European Parliament Speech'
# Read in the speech-level document variables.
speeches_dv <- read.delim('europarl-documents-metadata.tsv', sep = '\t')
# Subset the metadata to 2009-2010, the same years as the speech files that
# were read in.
speech_years <- year(speeches_dv$date)
speeches_dv <- speeches_dv[speech_years >= 2009 & speech_years <= 2010, ]
# Read in the MEP-level document variables.
MEP_dv <- read.delim('europarl-meps-metadata.tsv', sep = '\t')
# merge() sorts its result on the join key and puts unmatched rows last, so
# the merged rows no longer line up with the speeches. Record the original
# row position first and restore it after the merge.
speeches_dv$.speech_order <- seq_len(nrow(speeches_dv))
dv <- merge(speeches_dv, MEP_dv, all.x = TRUE,
            by.x = 'mep_ids', by.y = 'mep_id')
dv <- dv[order(dv$.speech_order), ]
dv$.speech_order <- NULL
speeches_dv$.speech_order <- NULL
rownames(dv) <- NULL
# Document variables are attached to the corpus by position, so there must be
# exactly one metadata row per document.
# NOTE(review): this still assumes the rows of the speech metadata file are in
# the same order as the files in speeches_paths - confirm, or join on a
# document id instead.
stopifnot(nrow(dv) == ndoc(speeches))
docvars(speeches) <- dv
# Drop any texts with missing document variables (no matching MEP record).
speeches <- corpus_subset(speeches, !is.na(country_short))
# Subset to a random 10% of the corpus. The sample is drawn with R's random
# number generator, so fix the seed first; otherwise every run fits the topic
# model on a different set of documents and the results cannot be reproduced.
set.seed(374075)
speeches_sub <- corpus_sample(speeches, size = floor(ndoc(speeches) / 10))
# Corpus-specific stopwords. The list mixes surface forms ('president',
# 'debate') with stems ('resolut', 'ladi'), so it is matched against stemmed
# tokens below rather than against the raw text.
custom_stops <- c('mr', 'mrs', 'president', 'paragraph', 'resolut', 'statement',
                  'gentlemen', 'ladi', 'regard', 'mention', 'point', 'debate')
# Build the document-feature matrix step by step. dfm(remove = , stem = TRUE)
# drops the `remove` patterns BEFORE stemming, so stem entries such as
# 'resolut' never matched a raw token, and surface-form entries left their
# other inflections ('presid', 'debat', ...) in the matrix.
speeches_toks <- tokens(speeches_sub, remove_punct = TRUE)
speeches_toks <- tokens_tolower(speeches_toks)
# Generic English stopwords are surface forms: remove them before stemming.
speeches_toks <- tokens_remove(speeches_toks, stopwords('english'))
speeches_toks <- tokens_wordstem(speeches_toks)
# Corpus-specific stopwords are removed stem-to-stem; the raw entries are kept
# in the pattern as well in case re-stemming an existing stem alters it.
custom_stop_stems <- unique(c(custom_stops, char_wordstem(custom_stops)))
speeches_toks <- tokens_remove(speeches_toks, custom_stop_stems)
# Tokens are already lower-cased, so skip the second pass.
speeches_dfm <- dfm(speeches_toks, tolower = FALSE)
# Convert the dfm into the documents / vocab / meta list that stm expects.
speeches_stm <- convert(speeches_dfm, to = 'stm')
# Prune rare terms: lower.thresh = 5 drops terms that do not occur in more
# than 5 documents, and re-indexes the documents and vocabulary accordingly.
speeches_stm <- with(
  speeches_stm,
  prepDocuments(documents = documents, vocab = vocab, meta = meta,
                lower.thresh = 5)
)
## Removing 8684 of 12751 terms (16169 of 275578 tokens) due to frequency
## Your corpus now has 3334 documents, 4067 terms and 259409 tokens.
# Fit a 15-topic structural topic model in which topic prevalence varies with
# the speaker's country and group. The seed is fixed for the model fit.
prevalence_formula <- ~ country + group
fit_stm_sw <- stm(
  documents = speeches_stm$documents,
  vocab = speeches_stm$vocab,
  data = speeches_stm$meta,
  K = 15,
  prevalence = prevalence_formula,
  sigma.prior = 0.1,
  seed = 374075
)
# Print the 10 top words of every topic under each labelling metric.
labelTopics(fit_stm_sw, n = 10)
## Topic 1 Top Words:
## Highest Prob: like, want, can, commission, think, also, one, say, question, need
## FREX: want, thing, cours, talk, realli, said, think, commission, someth, question
## Lift: davi, score, wonder, paul, realli, sure, bütikof, chris, hans-pet, kamal
## Score: say, said, want, commission, realli, talk, someth, speak, question, cours
## Topic 2 Top Words:
## Highest Prob: parliament, european, vote, agreement, 2009, resolut, rule, council, write, favour
## FREX: item, statement, vote, written, motion, agreement, request, 2009, data, mep
## Lift: part-sess, thursday, 149, item, raül, statement, ukip, wednesday, 12.00, 142
## Score: vote, agreement, data, item, 149, amend, motion, resolut, statement, written
## Topic 3 Top Words:
## Highest Prob: european, union, citizen, eu, treati, parliament, lisbon, secur, new, relat
## FREX: treati, lisbon, turkey, citizen, partnership, visa, extern, cooper, secur, role
## Lift: delic, justa, ombudsman, palecki, paliad, severin, vinca, diamandouro, enlarg, sport
## Score: treati, lisbon, turkey, visa, citizen, moldova, russia, ombudsman, cooper, partnership
## Topic 4 Top Words:
## Highest Prob: financi, crisi, fund, econom, european, budget, will, need, financ, new
## FREX: financi, budget, tax, crisi, financ, recoveri, bank, rate, fund, eur
## Lift: a7-0246, giegold, haven, supervisori, auster, budget, financi, microfin, recoveri, tax
## Score: financi, fund, budget, crisi, euro, tax, financ, eur, recoveri, bank
## Topic 5 Top Words:
## Highest Prob: will, european, council, group, parliament, member, like, us, europ, today
## FREX: presid, fellow, czech, barroso, spanish, speech, minist, togeth, council, van
## Lift: czech, belgian, corien, merkel, president-in-offic, ride, rompuy, turm, vondra, barroso
## Score: presid, council, minist, barroso, czech, applaus, greec, rompuy, spanish, say
## Topic 6 Top Words:
## Highest Prob: report, propos, committe, rapporteur, also, like, public, support, parliament, includ
## FREX: committe, rapporteur, report, propos, document, compromis, internet, recommend, opinion, affair
## Lift: andersson, göran, mander, unreason, färm, lundgren, nil, procur, draftsman, concili
## Score: report, committe, propos, rapporteur, amend, internet, procur, compromis, market, public
## Topic 7 Top Words:
## Highest Prob: right, human, european, women, protect, fundament, equal, must, freedom, state
## FREX: women, immigr, convent, discrimin, traffick, terror, asylum, sexual, gender, penalti
## Lift: asylum, barrot, detent, digniti, exhibit, frontex, guantánamo, hautala, migratori, offenc
## Score: women, human, violenc, sexual, right, traffick, terror, immigr, crime, crimin
## Topic 8 Top Words:
## Highest Prob: member, state, european, commiss, inform, system, will, legisl, legal, regul
## FREX: agenc, inform, transpar, legal, applic, administr, legisl, system, provis, patient
## Lift: a7-0215, accredit, pharmacovigil, tremopoulo, acqui, gering, leaflet, oedenberg, transposit, web
## Score: inform, legal, transpar, regul, patient, legisl, agenc, procedur, applic, state
## Topic 9 Top Words:
## Highest Prob: european, social, write, polici, region, develop, parliament, eu, ppe, pt
## FREX: pt, social, educ, region, cohes, promot, nuno, feio, diogo, poverti
## Lift: atyp, feio, hübner, melo, b7-0466, carvalho, casa, diogo, fernand, grassroot
## Score: social, pt, region, cohes, write, educ, poverti, nuno, polici, develop
## Topic 10 Top Words:
## Highest Prob: product, european, consum, food, transport, agricultur, produc, market, will, sector
## FREX: product, food, agricultur, anim, farmer, transport, farm, safeti, consum, chain
## Lift: airport, chain, freight, meat, a7-0029, a7-0225, aquacultur, aviat, b7-0208, b7-0559
## Score: product, anim, food, consum, agricultur, farmer, transport, meat, clone, safeti
## Topic 11 Top Words:
## Highest Prob: state, european, member, union, countri, aid, eu, solidar, suppli, natur
## FREX: disast, aid, south, haiti, romania, solidar, suppli, gas, baltic, north
## Lift: pipelin, disast, gulf, haiti, janusz, korean, opel, a7-0112, b6-0003, baltic
## Score: disast, haiti, gas, suppli, solidar, aid, south, russia, state, romania
## Topic 12 Top Words:
## Highest Prob: energi, develop, chang, must, climat, will, global, european, strategi, research
## FREX: climat, emiss, energi, 2020, research, global, target, effici, water, co
## Lift: b7-0536, co, deforest, millennium, 2050, a6-0495, cancún, climat, danub, dioxid
## Score: energi, climat, emiss, 2020, research, develop, effici, strategi, environment, biodivers
## Topic 13 Top Words:
## Highest Prob: countri, european, polit, parliament, intern, govern, situat, freedom, conflict, human
## FREX: regim, israel, iran, kosovo, western, balkan, georgia, iraq, conflict, civilian
## Lift: belarusian, bosnia, dictatorship, gaza, iranian, isra, kosovo, kyrgyzstan, palestinian, protest
## Score: iran, kosovo, israel, iranian, iraq, balkan, prison, gaza, peac, elect
## Topic 14 Top Words:
## Highest Prob: worker, work, compani, employ, european, job, busi, market, small, labour
## FREX: worker, compani, enterpris, fisheri, small, employe, labour, busi, redund, medium-s
## Lift: bluefin, driver, enterpris, franz, obermayr, owner, reloc, self-employ, smes, tuna
## Score: worker, compani, employ, egf, redund, enterpris, fish, labour, unemploy, medium-s
## Topic 15 Top Words:
## Highest Prob: peopl, year, europ, mani, problem, countri, world, even, live, us
## FREX: peopl, languag, year, live, mani, china, bad, noth, problem, chines
## Lift: britain, latvian, teach, twenti, eija-riitta, korhola, brother, chines, mirski, fi
## Score: peopl, languag, young, china, world, live, year, let, europ, problem