# Supplemental material to: 
#           Swimming with the Tide? Positional Claim Detection across Political Text Types.
#           In: Proceedings of the NLP+CSS workshop. Online, 2020. Accepted for publication
#           Nico Blokker, Erenay Dayanik, Gabriella Lapesa and Sebastian Padó. 

# Please cite the MARPOR project if you re-use the texts (see below): 
#           Volkens, Andrea / Burst, Tobias / Krause, Werner / Lehmann, Pola / 
#           Matthieß, Theres / Merz, Nicolas / Regel, Sven / Weßels, Bernhard / 
#           Zehnter, Lisa (2020): 
#           The Manifesto Data Collection. Manifesto Project (MRG/CMP/MARPOR). Version 2020a. 
#           Berlin: Wissenschaftszentrum Berlin für Sozialforschung (WZB).
#           https://doi.org/10.25522/manifesto.mpds.2020a

# In order to download the texts you need access to the API of the MARPOR-Project
# (https://manifestoproject.wzb.eu/information/documents/api) and an API-Key 
# (register here: https://manifestoproject.wzb.eu/login). 
# In the following script we use the R package 'manifestoR' 
# (https://manifestoproject.wzb.eu/information/documents/manifestoR)
# to download the data.

# For further information regarding the coding scheme refer to Lapesa et al. 2020:
#          hdl.handle.net/11022/1007-0000-0007-DB07-B


# 0 preparation -----------------------------------------------------------

library(manifestoR)   # version 1.4.0
library(dplyr)        # version 1.0.0


# 1 load annotations ------------------------------------------------------

# Both objects are read from .rds files in the current working directory.
# `results`  : the annotations.
# `sections` : the corresponding chapters; used in section 3 as a row index
#              into `results`.
results  <- readRDS(file = "annotations.rds")
sections <- readRDS(file = "chapter_indices.rds")


# 2 download raw text from MARPOR project ---------------------------------

# Prefer an API key supplied via the environment variable MANIFESTO_API_KEY so
# that the personal key does not have to be written into (and shared with) this
# script. If the variable is unset or empty, the literal below is used instead.
API_key <- Sys.getenv("MANIFESTO_API_KEY", unset = "")
if (!nzchar(API_key)) {
  API_key <- "0123456789"                                   # ENTER API KEY HERE
}
mp_setapikey(key = API_key)

# select data
#
# German manifestos with an election date strictly between 2012-12-31 and
# 2017-12-31, restricted to the five party codes that lookup() below labels
# "green", "left", "spd", "cdu" and "afd".
programs <- mp_corpus(countryname == "Germany" &
                        edate > as.Date("2012-12-31") &
                        edate < as.Date("2017-12-31") &
                        party %in% c("41113", "41223", "41320", "41521", "41953"))

# re-structure into data.frame

# Turn one manifesto of a corpus into a data.frame with one row per text unit.
#
# Args:
#   x:      position of the manifesto within `corpus`.
#   corpus: named collection of manifestos; defaults to the global `programs`
#           object, so existing calls of the form lookup(x) keep working.
#           Element names are parsed as "<party code>_<digits>", where the first
#           four digits after the underscore are taken as the election year.
#
# Returns:
#   A data.frame with character columns
#     party: short party label from the dictionary below (NA if the party code
#            is not listed there),
#     year:  four-digit year extracted from the element name,
#     quote: the values returned by content() for the manifesto.
lookup <- function(x, corpus = programs) {
  dictionary <- c("41113" = "green",
                  "41223" = "left",
                  "41320" = "spd",
                  "41420" = "fdp",
                  "41521" = "cdu",
                  "41952" = "pirates",
                  "41953" = "afd")
  title <- names(corpus[x])
  # Everything before the underscore is the party code.
  party_code <- gsub("_\\d+", "", title)
  # Drop the code used as vector name so only the label ends up in the column.
  party <- unname(dictionary[party_code])
  year <- gsub("\\d+_(\\d{4}).*", "\\1", title)
  # party and year have length one and are recycled over all text units.
  data.frame(
    party = party,
    year = year,
    quote = content(corpus[[x]]),
    stringsAsFactors = FALSE,
    row.names = NULL
  )
}
# Fail early with a clear message: without any manifesto there is nothing to
# merge with the annotations.
if (length(programs) == 0L) {
  stop("No manifestos were downloaded; check the API key and the query.",
       call. = FALSE)
}
# One data.frame per manifesto; seq_along() avoids the c(1, 0) index sequence
# that 1:length() produces for an empty corpus.
l <- lapply(seq_along(programs), lookup)
# Stack all manifestos and sort by text. Section 3 attaches these rows to the
# annotations purely by position, so this ordering matters.
# NOTE(review): how arrange() orders character columns depends on the dplyr
# version and locale settings (see the `.locale` argument in newer dplyr
# releases); presumably the ordering must match the one used when the
# annotations were created with dplyr 1.0.0 -- confirm before relying on a
# newer setup.
df <- do.call(rbind, l) %>% arrange(quote)


# 3 merge with annotations --------------------------------------------------

# The text is attached to the annotations by position, so both tables must have
# the same number of rows; stop with an explicit message otherwise.
if (nrow(results) != length(df$quote)) {
  stop("Number of downloaded text units (", length(df$quote),
       ") does not match the number of annotations (", nrow(results), ").",
       call. = FALSE)
}

# Sort the annotations by `order_quote`, add the (sorted) text as `quote2`,
# then sort by `fixed`.
results <- results %>%
  arrange(order_quote) %>%
  mutate(quote2 = df$quote) %>%
  arrange(fixed)

# Pick the rows given by `sections` and drop the two helper columns.
results <- results[sections, ] %>% select(-fixed, -order_quote)
results$claimvalues[563] <- "-401|402|999" # spd not opposing refugee protection, mislabeled

# Open the data viewer only in interactive sessions so the script can also be
# run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(results)
}

# the resulting data.frame contains 5 variables:
#           party: party name 
#           year: election year
#           claimvalues: annotated claim-category or claim-categories (separated by "|").
#                        Negative polarity indicated by "-".
#           detected: logical, whether our models identified the text snippet as a claim
#           quote2: the text snippet