{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Determinants of Grader Agreement"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ulrike Pado (ulrike.pado@hft-stuttgart.de) + Sebastian Pado (sebastian.pado@ims.uni-stuttgart.de), ms., 2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "General structure of this notebook: first prepare data and create models for individual corpora (=section titles), then create joint models for LA and CA corpora. For each model,\n",
    "create the full model, compare against a random-only model, and test multicollinearity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading required package: Matrix\n"
     ]
    }
   ],
   "source": [
    "library(lme4)\n",
    "library(data.table)\n",
    "library(blme)\n",
    "#library(broom.mixed)\n",
    "#library(dotwhisker)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "R version 3.3.3 (2017-03-06)\n",
       "Platform: x86_64-apple-darwin13.4.0 (64-bit)\n",
       "Running under: macOS  10.15.7\n",
       "\n",
       "locale:\n",
       "[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n",
       "\n",
       "attached base packages:\n",
       "[1] stats     graphics  grDevices utils     datasets  methods   base     \n",
       "\n",
       "other attached packages:\n",
       "[1] blme_1.0-4        data.table_1.11.4 lme4_1.1-21       Matrix_1.2-8     \n",
       "\n",
       "loaded via a namespace (and not attached):\n",
       " [1] Rcpp_1.0.2          lattice_0.20-34     digest_0.6.12      \n",
       " [4] crayon_1.3.4        MASS_7.3-45         IRdisplay_0.4.4    \n",
       " [7] repr_0.12.0         grid_3.3.3          R6_2.2.2           \n",
       "[10] nlme_3.1-131        jsonlite_1.5        magrittr_1.5       \n",
       "[13] evaluate_0.10.1     stringi_1.1.5       uuid_0.1-2         \n",
       "[16] minqa_1.2.4         nloptr_1.0.4        boot_1.3-18        \n",
       "[19] IRkernel_0.8.7.9000 splines_3.3.3       tools_3.3.3        \n",
       "[22] stringr_1.2.0       pbdZMQ_0.2-6       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# needs lme4 >= 1.19\n",
    "sessionInfo()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Powergrading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   questionID                                       studID     language \n",
       " pg_1   : 698   pg_00c9ba67-8ac4-410d-9cf7-d5ab904276b5:  10   en:6979  \n",
       " pg_13  : 698   pg_01bf2fe6-5e20-4d53-b845-9e66c852857d:  10            \n",
       " pg_2   : 698   pg_01d6dfd8-d95c-4af4-8331-7882587b85f4:  10            \n",
       " pg_20  : 698   pg_02103392-03e5-426e-917d-29c7b9b1db3e:  10            \n",
       " pg_4   : 698   pg_022ffb90-4d50-4953-bbe1-8897530b228b:  10            \n",
       " pg_5   : 698   pg_02417ca3-c996-4fb8-84e9-602439531877:  10            \n",
       " (Other):2791   (Other)                                :6919            \n",
       "  correctness         anno1            anno2         answerLength   \n",
       " Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :  1.00  \n",
       " 1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.: 11.00  \n",
       " Median :1.0000   Median :1.0000   Median :1.0000   Median : 18.00  \n",
       " Mean   :0.8506   Mean   :0.8531   Mean   :0.8543   Mean   : 24.64  \n",
       " 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 31.00  \n",
       " Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :535.00  \n",
       "                                                                    \n",
       " questionLength      type         diffLevel         Sim        \n",
       " Min.   :34.0   content:6979   remember:6979   Min.   :0.0000  \n",
       " 1st Qu.:49.0                                  1st Qu.:0.3021  \n",
       " Median :63.0                                  Median :0.4912  \n",
       " Mean   :59.6                                  Mean   :0.4812  \n",
       " 3rd Qu.:66.0                                  3rd Qu.:0.6722  \n",
       " Max.   :88.0                                  Max.   :0.9429  \n",
       "                                                               \n",
       "   ans_homog       collection       \n",
       " Min.   :0.3013   Length:6979       \n",
       " 1st Qu.:0.3986   Class :character  \n",
       " Median :0.4218   Mode  :character  \n",
       " Mean   :0.4812                     \n",
       " 3rd Qu.:0.5505                     \n",
       " Max.   :0.8448                     \n",
       "                                    "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pg <- data.table(read.csv(\"data//Powergrading.txt\",sep=\"\\t\"))\n",
    "pg$questionID = as.factor(paste(\"pg\",pg$questionID,sep=\"_\"))\n",
    "pg$studID = as.factor(paste(\"pg\",pg$studID,sep=\"_\"))\n",
    "pg$collection <- \"research\"\n",
    "pg <- na.omit(pg) # remove 1 datapoint  with Sim==NA\n",
    "summary(pg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\n",
       "   0    1 \n",
       " 286 6693 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "3.15282567228079"
      ],
      "text/latex": [
       "3.15282567228079"
      ],
      "text/markdown": [
       "3.15282567228079"
      ],
      "text/plain": [
       "[1] 3.152826"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#Compute agreement of PG corpus\n",
    "pg$agree <- 1 - abs(pg$anno1-pg$anno2)\n",
    "#hist(pg$agree)\n",
    "table(pg$agree)\n",
    "p_0 <- nrow(pg[pg$agree==1,])/nrow(pg)\n",
    "log(p_0/(1-p_0))\n",
    "pg$corpus <- \"pg\"\n",
    "# load asap just to have the # of datapoints\n",
    "asap <- read.csv(\"data/ASAP_train.txt\",sep=\"\\t\")\n",
    "pg$weights <- round(nrow(asap)/nrow(pg))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>0</dt>\n",
       "\t\t<dd>1043</dd>\n",
       "\t<dt>1</dt>\n",
       "\t\t<dd>5936</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0] 1043\n",
       "\\item[1] 5936\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0\n",
       ":   10431\n",
       ":   5936\n",
       "\n"
      ],
      "text/plain": [
       "   0    1 \n",
       "1043 5936 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pg$correct_fac <- '0'\n",
    "pg[pg$correctness >= 0.5,]$correct_fac <- '1'\n",
    "pg$correct_fac <- as.factor(pg$correct_fac)\n",
    "summary(pg$correct_fac)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Compute normalized answer and question length\n",
    "\n",
    "pg$alnorm <- scale(log(pg$answerLength +1))\n",
    "pg$qlnorm <- scale(log(pg$questionLength +1))\n",
    "#hist(pg$alnorm)\n",
    "#hist(pg$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute per-question standard deviation of similarity and normalize\n",
    "\n",
    "pg$relsim <- pg$Sim - pg$ans_homog\n",
    "#pg$relsim\n",
    "per_q_sd <- pg[, sd(relsim), by=questionID]\n",
    "qid_idx <- which(colnames(pg) == \"questionID\")\n",
    "rs_idx <- which(colnames(pg) == \"relsim\")\n",
    "pg$simdevnorm <- apply(pg, 1, function(row) {\n",
    "    relsim <- as.numeric(row[rs_idx])\n",
    "    qid <- row[qid_idx]\n",
    "    relsim / per_q_sd[questionID == qid]$V1\n",
    "    })\n",
    "rm(qid_idx,rs_idx,per_q_sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>1974</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>2212</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>2793</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 1974\n",
       "\\item[mid] 2212\n",
       "\\item[high] 2793\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   1974mid\n",
       ":   2212high\n",
       ":   2793\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       "1974 2212 2793 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# bin normalized similarities\n",
    "\n",
    "pg$simCat <- \"mid\"\n",
    "pg[pg$simdevnorm >= 0.5,]$simCat <- \"high\"\n",
    "pg[pg$simdevnorm <= -0.5,]$simCat <- \"low\"\n",
    "pg$simCat <- factor(pg$simCat, levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(pg$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit an LMER for PG\n",
    "# no difficulty levels, b/c all PG is 'remember'\n",
    "\n",
    "pgmodel <- bglmer(agree ~ \n",
    "                  alnorm + \n",
    "                  simCat +\n",
    "                  ans_homog +\n",
    "                  correct_fac +\n",
    "                  (1|questionID) + \n",
    "                  (1|studID),\n",
    "                 pg,                 \n",
    "                family = \"binomial\", control = glmerControl(optimizer = \"bobyqa\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "isSingular(pgmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 3.0226\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: \n",
       "agree ~ alnorm + simCat + ans_homog + correct_fac + (1 | questionID) +  \n",
       "    (1 | studID)\n",
       "   Data: pg\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  1796.7   1851.5   -890.3   1780.7     6971 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-14.6357   0.0618   0.0835   0.1747   1.0152 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.1255   0.3543  \n",
       " questionID (Intercept) 1.0620   1.0305  \n",
       "Number of obs: 6979, groups:  studID, 698; questionID, 10\n",
       "\n",
       "Fixed effects:\n",
       "             Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)  -0.33522    1.23073  -0.272   0.7853    \n",
       "alnorm       -0.02237    0.07173  -0.312   0.7552    \n",
       "simCatmid     1.70473    0.23048   7.396 1.40e-13 ***\n",
       "simCathigh    1.55575    0.22542   6.902 5.14e-12 ***\n",
       "ans_homog     4.46705    2.53127   1.765   0.0776 .  \n",
       "correct_fac1  1.68223    0.17178   9.793  < 2e-16 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg ans_hm\n",
       "alnorm      -0.025                            \n",
       "simCatmid    0.021  0.128                     \n",
       "simCathigh  -0.014  0.379  0.296              \n",
       "ans_homog   -0.959  0.022 -0.031  0.012       \n",
       "correct_fc1 -0.038 -0.290 -0.302 -0.462  0.002"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(pgmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit a random effects-only model for PG for comparison to fuller model\n",
    "\n",
    "pgmodel_empty <- bglmer(agree ~\n",
    "                       (1|questionID) +\n",
    "                       (1|studID),\n",
    "                       data = pg,\n",
    "                       family = \"binomial\", \n",
    "                       control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 3.2309\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID) + (1 | studID)\n",
       "   Data: pg\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  2207.9   2228.5  -1101.0   2201.9     6976 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-16.2824   0.1263   0.1475   0.2676   0.4078 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.08722  0.2953  \n",
       " questionID (Intercept) 1.33024  1.1534  \n",
       "Number of obs: 6979, groups:  studID, 698; questionID, 10\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)   3.6358     0.3786   9.602   <2e-16 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(pgmodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>pgmodel_empty</th><td>3           </td><td>2207.919    </td><td>2228.471    </td><td>-1100.9594  </td><td>2201.919    </td><td>      NA    </td><td>NA          </td><td>          NA</td></tr>\n",
       "\t<tr><th scope=row>pgmodel</th><td>8           </td><td>1796.689    </td><td>1851.494    </td><td> -890.3443  </td><td>1780.689    </td><td>421.2302    </td><td> 5          </td><td>7.865091e-89</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tpgmodel\\_empty & 3            & 2207.919     & 2228.471     & -1100.9594   & 2201.919     &       NA     & NA           &           NA\\\\\n",
       "\tpgmodel & 8            & 1796.689     & 1851.494     &  -890.3443   & 1780.689     & 421.2302     &  5           & 7.865091e-89\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| pgmodel_empty | 3            | 2207.919     | 2228.471     | -1100.9594   | 2201.919     |       NA     | NA           |           NA | \n",
       "| pgmodel | 8            | 1796.689     | 1851.494     |  -890.3443   | 1780.689     | 421.2302     |  5           | 7.865091e-89 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "              Df AIC      BIC      logLik     deviance Chisq    Chi Df\n",
       "pgmodel_empty 3  2207.919 2228.471 -1100.9594 2201.919       NA NA    \n",
       "pgmodel       8  1796.689 1851.494  -890.3443 1780.689 421.2302  5    \n",
       "              Pr(>Chisq)  \n",
       "pgmodel_empty           NA\n",
       "pgmodel       7.865091e-89"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(pgmodel_empty, pgmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the nonempty model is much better"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.191414</td><td>1       </td><td>1.091519</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.442921</td><td>2       </td><td>1.096000</td></tr>\n",
       "\t<tr><th scope=row>ans_homog</th><td>1.001884</td><td>1       </td><td>1.000941</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.346782</td><td>1       </td><td>1.160509</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.191414 & 1        & 1.091519\\\\\n",
       "\tsimCat & 1.442921 & 2        & 1.096000\\\\\n",
       "\tans\\_homog & 1.001884 & 1        & 1.000941\\\\\n",
       "\tcorrect\\_fac & 1.346782 & 1        & 1.160509\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|\n",
       "| alnorm | 1.191414 | 1        | 1.091519 | \n",
       "| simCat | 1.442921 | 2        | 1.096000 | \n",
       "| ans_homog | 1.001884 | 1        | 1.000941 | \n",
       "| correct_fac | 1.346782 | 1        | 1.160509 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "            GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm      1.191414 1  1.091519       \n",
       "simCat      1.442921 2  1.096000       \n",
       "ans_homog   1.001884 1  1.000941       \n",
       "correct_fac 1.346782 1  1.160509       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# test multicollinearity\n",
    "\n",
    "car::vif(pgmodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No collinearity problems"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CREE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "         questionID          studID    language  correctness    \n",
       " cree_DU3C6R21: 17   cree_AU066 : 47   en:566   Min.   :0.0000  \n",
       " cree_DU3C6R22: 17   cree_AU068 : 47            1st Qu.:0.0000  \n",
       " cree_DU3C6R23: 17   cree_AU061 : 46            Median :1.0000  \n",
       " cree_DU3C6R24: 17   cree_AU063 : 39            Mean   :0.7226  \n",
       " cree_DU3C6R26: 17   cree_AU067 : 28            3rd Qu.:1.0000  \n",
       " cree_DU3C6R27: 17   cree_SP0713: 28            Max.   :1.0000  \n",
       " (Other)      :464   (Other)    :331                            \n",
       "     anno1            anno2        answerLength    questionLength  \n",
       " Min.   :0.0000   Min.   :0.000   Min.   :  5.00   Min.   : 21.00  \n",
       " 1st Qu.:0.0000   1st Qu.:0.000   1st Qu.: 74.25   1st Qu.: 46.00  \n",
       " Median :1.0000   Median :1.000   Median :120.00   Median : 64.00  \n",
       " Mean   :0.7226   Mean   :0.742   Mean   :135.00   Mean   : 64.57  \n",
       " 3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:178.00   3rd Qu.: 78.00  \n",
       " Max.   :1.0000   Max.   :1.000   Max.   :543.00   Max.   :162.00  \n",
       "                                                                   \n",
       "       type              diffLevel        Sim            ans_homog     \n",
       " language:566   literal       :472   Min.   :0.01613   Min.   :0.1262  \n",
       "                reorganization: 31   1st Qu.:0.34929   1st Qu.:0.4176  \n",
       "                inference     : 63   Median :0.50592   Median :0.4957  \n",
       "                                     Mean   :0.52396   Mean   :0.5240  \n",
       "                                     3rd Qu.:0.67311   3rd Qu.:0.6272  \n",
       "                                     Max.   :1.00000   Max.   :0.9677  \n",
       "                                                                       \n",
       "  collection       \n",
       " Length:566        \n",
       " Class :character  \n",
       " Mode  :character  \n",
       "                   \n",
       "                   \n",
       "                   \n",
       "                   "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cree <- data.table(read.csv(\"data/CREE.txt\",sep=\"\\t\"))\n",
    "cree$studID <- as.factor(paste(\"cree\",cree$studID,sep=\"_\"))\n",
    "cree$questionID <- as.factor(paste(\"cree\",cree$questionID,sep=\"_\"))\n",
    "cree$diffLevel <- factor(cree$diffLevel, levels=c(\"literal\",\"reorganization\",\"inference\"))\n",
    "cree$collection <- \"classroom\"\n",
    "summary(cree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. \n",
       " 0.0000  1.0000  1.0000  0.8604  1.0000  1.0000 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "  0   1 \n",
       " 79 487 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cree$agree <- 1- abs(cree$anno1-cree$anno2)\n",
    "summary(cree$agree)\n",
    "table(as.factor(cree$agree))\n",
    "cree$corpus <- \"cree\"\n",
    "cree$weights <- round(nrow(asap)/nrow(cree))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>0</dt>\n",
       "\t\t<dd>157</dd>\n",
       "\t<dt>1</dt>\n",
       "\t\t<dd>409</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0] 157\n",
       "\\item[1] 409\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0\n",
       ":   1571\n",
       ":   409\n",
       "\n"
      ],
      "text/plain": [
       "  0   1 \n",
       "157 409 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cree$correct_fac <- '0'\n",
    "cree[cree$correctness >= 0.5,]$correct_fac <- '1'\n",
    "cree$correct_fac <- as.factor(cree$correct_fac)\n",
    "summary(cree$correct_fac)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "cree$alnorm <- scale(log(cree$answerLength +1))\n",
    "cree$qlnorm <- scale(log(cree$questionLength +1))\n",
    "#hist(cree$alnorm)\n",
    "#hist(cree$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute per-question standard deviation of similarity and normalize\n",
    "\n",
    "cree$relsim <- cree$Sim - cree$ans_homog\n",
    "#pg$relsim\n",
    "per_q_sd <- cree[, sd(relsim), by=questionID]\n",
    "qid_idx <- which(colnames(cree) == \"questionID\")\n",
    "rs_idx <- which(colnames(cree) == \"relsim\")\n",
    "cree$simdevnorm <- apply(cree, 1, function(row) {\n",
    "    relsim <- as.numeric(row[rs_idx])\n",
    "    qid <- row[qid_idx]\n",
    "    relsim / per_q_sd[questionID == qid]$V1\n",
    "    })\n",
    "rm(qid_idx,rs_idx,per_q_sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>172</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>194</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>200</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 172\n",
       "\\item[mid] 194\n",
       "\\item[high] 200\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   172mid\n",
       ":   194high\n",
       ":   200\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       " 172  194  200 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# bin normalized similarities\n",
    "\n",
    "cree$simCat <- \"mid\"\n",
    "cree[cree$simdevnorm >= 0.5,]$simCat <- \"high\"\n",
    "cree[cree$simdevnorm <= -0.5,]$simCat <- \"low\"\n",
    "cree$simCat <- factor(cree$simCat, levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(cree$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LMER for CREE\n",
    "\n",
    "creemodel <- bglmer(agree ~ \n",
    "                   alnorm +      \n",
    "                   simCat +\n",
    "                   diffLevel * correct_fac + \n",
    "                   scale(ans_homog) +\n",
    "                   (1|questionID) + \n",
    "                   (1|studID),\n",
    "                  cree,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "isSingular(creemodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 2.8361\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: \n",
       "agree ~ alnorm + simCat + diffLevel * correct_fac + scale(ans_homog) +  \n",
       "    (1 | questionID) + (1 | studID)\n",
       "   Data: cree\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "   419.7    471.8   -197.9    395.7      554 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-5.4065  0.1440  0.2158  0.3676  1.8091 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 1.58483  1.2589  \n",
       " studID     (Intercept) 0.09525  0.3086  \n",
       "Number of obs: 566, groups:  questionID, 61; studID, 26\n",
       "\n",
       "Fixed effects:\n",
       "                                     Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                           1.35578    0.38026   3.565 0.000363 ***\n",
       "alnorm                               -0.11692    0.18714  -0.625 0.532127    \n",
       "simCatmid                             0.03778    0.37813   0.100 0.920405    \n",
       "simCathigh                           -0.27177    0.37378  -0.727 0.467175    \n",
       "diffLevelreorganization               0.08129    1.29279   0.063 0.949861    \n",
       "diffLevelinference                   -0.01024    1.02002  -0.010 0.991986    \n",
       "correct_fac1                          1.71193    0.38282   4.472 7.75e-06 ***\n",
       "scale(ans_homog)                      0.44963    0.25486   1.764 0.077690 .  \n",
       "diffLevelreorganization:correct_fac1 -0.61273    1.63558  -0.375 0.707937    \n",
       "diffLevelinference:correct_fac1       0.80644    1.11000   0.727 0.467516    \n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg dffLvlr dffLvln crrc_1 scl(_)\n",
       "alnorm       0.053                                                   \n",
       "simCatmid   -0.342  0.162                                            \n",
       "simCathigh  -0.367  0.388  0.528                                     \n",
       "dffLvlrrgnz -0.191  0.046  0.017  0.034                              \n",
       "dffLvlnfrnc -0.181 -0.045 -0.129 -0.056  0.073                       \n",
       "correct_fc1 -0.344 -0.323 -0.231 -0.243  0.116   0.210               \n",
       "scl(ns_hmg)  0.150 -0.032  0.022 -0.035  0.004   0.124  -0.069       \n",
       "dffLvlrr:_1  0.106 -0.090 -0.001 -0.051 -0.718  -0.021  -0.177  0.124\n",
       "dffLvlnf:_1  0.102  0.023  0.140  0.074 -0.040  -0.620  -0.325  0.038\n",
       "            dffLvlr:_1\n",
       "alnorm                \n",
       "simCatmid             \n",
       "simCathigh            \n",
       "dffLvlrrgnz           \n",
       "dffLvlnfrnc           \n",
       "correct_fc1           \n",
       "scl(ns_hmg)           \n",
       "dffLvlrr:_1           \n",
       "dffLvlnf:_1  0.071    "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(creemodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random-only model for CREE\n",
    "\n",
    "creemodel_empty <- bglmer(agree ~ (1|questionID) + (1|studID),\n",
    "                  data = cree,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 2.0738\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID) + (1 | studID)\n",
       "   Data: cree\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "   439.2    452.2   -216.6    433.2      563 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-3.6014  0.2000  0.2610  0.3608  1.0789 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 1.8799   1.3711  \n",
       " studID     (Intercept) 0.1335   0.3654  \n",
       "Number of obs: 566, groups:  questionID, 61; studID, 26\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)    2.388      0.300   7.961  1.7e-15 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(creemodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>creemodel_empty</th><td> 3          </td><td>439.2234    </td><td>452.2391    </td><td>-216.6117   </td><td>433.2234    </td><td>      NA    </td><td>NA          </td><td>          NA</td></tr>\n",
       "\t<tr><th scope=row>creemodel</th><td>12          </td><td>419.7139    </td><td>471.7770    </td><td>-197.8569   </td><td>395.7139    </td><td>37.50949    </td><td> 9          </td><td>2.134108e-05</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tcreemodel\\_empty &  3           & 439.2234     & 452.2391     & -216.6117    & 433.2234     &       NA     & NA           &           NA\\\\\n",
       "\tcreemodel & 12           & 419.7139     & 471.7770     & -197.8569    & 395.7139     & 37.50949     &  9           & 2.134108e-05\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| creemodel_empty |  3           | 439.2234     | 452.2391     | -216.6117    | 433.2234     |       NA     | NA           |           NA | \n",
       "| creemodel | 12           | 419.7139     | 471.7770     | -197.8569    | 395.7139     | 37.50949     |  9           | 2.134108e-05 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                Df AIC      BIC      logLik    deviance Chisq    Chi Df\n",
       "creemodel_empty  3 439.2234 452.2391 -216.6117 433.2234       NA NA    \n",
       "creemodel       12 419.7139 471.7770 -197.8569 395.7139 37.50949  9    \n",
       "                Pr(>Chisq)  \n",
       "creemodel_empty           NA\n",
       "creemodel       2.134108e-05"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(creemodel_empty, creemodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Result: the random-only model fits significantly worse than the full model (likelihood-ratio test, p < 0.001)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.305602</td><td>1       </td><td>1.142629</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.247848</td><td>2       </td><td>1.056916</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>3.577166</td><td>2       </td><td>1.375260</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.355495</td><td>1       </td><td>1.164257</td></tr>\n",
       "\t<tr><th scope=row>scale(ans_homog)</th><td>1.073423</td><td>1       </td><td>1.036061</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>3.932768</td><td>2       </td><td>1.408233</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.305602 & 1        & 1.142629\\\\\n",
       "\tsimCat & 1.247848 & 2        & 1.056916\\\\\n",
       "\tdiffLevel & 3.577166 & 2        & 1.375260\\\\\n",
       "\tcorrect\\_fac & 1.355495 & 1        & 1.164257\\\\\n",
       "\tscale(ans\\_homog) & 1.073423 & 1        & 1.036061\\\\\n",
       "\tdiffLevel:correct\\_fac & 3.932768 & 2        & 1.408233\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 1.305602 | 1        | 1.142629 | \n",
       "| simCat | 1.247848 | 2        | 1.056916 | \n",
       "| diffLevel | 3.577166 | 2        | 1.375260 | \n",
       "| correct_fac | 1.355495 | 1        | 1.164257 | \n",
       "| scale(ans_homog) | 1.073423 | 1        | 1.036061 | \n",
       "| diffLevel:correct_fac | 3.932768 | 2        | 1.408233 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                1.305602 1  1.142629       \n",
       "simCat                1.247848 2  1.056916       \n",
       "diffLevel             3.577166 2  1.375260       \n",
       "correct_fac           1.355495 1  1.164257       \n",
       "scale(ans_homog)      1.073423 1  1.036061       \n",
       "diffLevel:correct_fac 3.932768 2  1.408233       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# test multicollinearity\n",
    "\n",
    "car::vif(creemodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No collinearity problems even with the interaction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CREG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     questionID        studID     language   correctness         anno1       \n",
       " creg_2068:  97   creg_110:  44   de:4384   Min.   :0.0000   Min.   :0.0000  \n",
       " creg_2069:  85   creg_220:  42             1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " creg_2085:  67   creg_230:  42             Median :1.0000   Median :1.0000  \n",
       " creg_2088:  65   creg_368:  42             Mean   :0.7126   Mean   :0.7126  \n",
       " creg_2087:  64   creg_231:  39             3rd Qu.:1.0000   3rd Qu.:1.0000  \n",
       " creg_2082:  63   creg_232:  39             Max.   :1.0000   Max.   :1.0000  \n",
       " (Other)  :3943   (Other) :4136                                              \n",
       "     anno2         answerLength    questionLength       type     \n",
       " Min.   :0.0000   Min.   : 15.00   Min.   : 19    language:4384  \n",
       " 1st Qu.:1.0000   1st Qu.: 39.00   1st Qu.: 40                   \n",
       " Median :1.0000   Median : 56.00   Median : 54                   \n",
       " Mean   :0.7888   Mean   : 68.38   Mean   : 61                   \n",
       " 3rd Qu.:1.0000   3rd Qu.: 87.00   3rd Qu.: 71                   \n",
       " Max.   :1.0000   Max.   :643.00   Max.   :169                   \n",
       "                                                                 \n",
       "          diffLevel         Sim           ans_homog        collection       \n",
       " literal       :3552   Min.   :0.0000   Min.   :0.09764   Length:4384       \n",
       " reorganization: 581   1st Qu.:0.2752   1st Qu.:0.32134   Class :character  \n",
       " inference     : 251   Median :0.3869   Median :0.38051   Mode  :character  \n",
       "                       Mean   :0.4183   Mean   :0.41834                     \n",
       "                       3rd Qu.:0.5325   3rd Qu.:0.48684                     \n",
       "                       Max.   :0.9944   Max.   :0.94908                     \n",
       "                                                                            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# load CREG (tab-separated), prefix the IDs with the corpus name,\n",
    "# fix the order of the difficulty levels, and drop rows with missing values\n",
    "creg <- data.table(read.delim(\"data/CREG.txt\"))\n",
    "creg$studID <- as.factor(paste(\"creg\", creg$studID, sep=\"_\"))\n",
    "creg$questionID <- as.factor(paste(\"creg\", creg$questionID, sep=\"_\"))\n",
    "creg$diffLevel <- factor(creg$diffLevel,\n",
    "                         levels=c(\"literal\",\"reorganization\",\"inference\"))\n",
    "# na.omit must come after the factor() call above: values outside the listed levels become NA there\n",
    "creg <- na.omit(creg)\n",
    "creg$collection <- \"classroom\"\n",
    "summary(creg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. \n",
       " 0.0000  1.0000  1.0000  0.8631  1.0000  1.0000 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "   0    1 \n",
       " 600 3784 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# agreement indicator: 1 - |anno1 - anno2|\n",
    "creg$agree <- with(creg, 1 - abs(anno1 - anno2))\n",
    "creg$corpus <- \"creg\"\n",
    "# weight = size of ASAP relative to CREG (rounded)\n",
    "# NOTE(review): relies on `asap` already existing when this cell runs - confirm it is loaded earlier\n",
    "creg$weights <- round(nrow(asap) / nrow(creg))\n",
    "\n",
    "summary(creg$agree)\n",
    "table(as.factor(creg$agree))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>0</dt>\n",
       "\t\t<dd>1260</dd>\n",
       "\t<dt>1</dt>\n",
       "\t\t<dd>3124</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0] 1260\n",
       "\\item[1] 3124\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0\n",
       ":   12601\n",
       ":   3124\n",
       "\n"
      ],
      "text/plain": [
       "   0    1 \n",
       "1260 3124 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# binarise correctness: values of at least 0.5 become level '1', all others '0'\n",
    "creg$correct_fac <- as.factor(ifelse(creg$correctness >= 0.5, '1', '0'))\n",
    "summary(creg$correct_fac)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# log-transform (with +1 inside the log) and scale() the answer and question lengths\n",
    "creg$alnorm <- scale(log(creg$answerLength +1))\n",
    "creg$qlnorm <- scale(log(creg$questionLength +1))\n",
    "# optional visual check of the transformed distributions:\n",
    "#hist(creg$alnorm)\n",
    "#hist(creg$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# relsim: Sim minus ans_homog for each row\n",
    "creg$relsim <- creg$Sim - creg$ans_homog\n",
    "\n",
    "# simdevnorm: relsim divided by the standard deviation of relsim within the same question.\n",
    "# ave() broadcasts each question's sd back onto its rows, so all values stay numeric.\n",
    "# (A row-wise apply() over the table would first coerce every column to character\n",
    "# and re-parse relsim from its formatted text, losing precision.)\n",
    "creg$simdevnorm <- creg$relsim / ave(creg$relsim, creg$questionID, FUN = sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>1251</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>1573</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>1560</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 1251\n",
       "\\item[mid] 1573\n",
       "\\item[high] 1560\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   1251mid\n",
       ":   1573high\n",
       ":   1560\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       "1251 1573 1560 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# bin normalized similarity: >= 0.5 is \"high\", <= -0.5 is \"low\", everything in between is \"mid\"\n",
    "\n",
    "creg$simCat <- factor(\n",
    "    ifelse(creg$simdevnorm >= 0.5, \"high\",\n",
    "           ifelse(creg$simdevnorm <= -0.5, \"low\", \"mid\")),\n",
    "    levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(creg$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# full model for CREG: fixed effects for answer length, similarity bin,\n",
    "# difficulty level x correctness and answer homogeneity;\n",
    "# random intercepts for question and student\n",
    "cregmodel <- bglmer(\n",
    "    agree ~ alnorm + simCat + diffLevel * correct_fac + scale(ans_homog) +\n",
    "        (1|questionID) + (1|studID),\n",
    "    data = creg,\n",
    "    family=\"binomial\",\n",
    "    control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# singularity check for the CREG full model (this cell previously tested the CREE model)\n",
    "isSingular(cregmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 1.5362\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: \n",
       "agree ~ alnorm + simCat + diffLevel * correct_fac + scale(ans_homog) +  \n",
       "    (1 | questionID) + (1 | studID)\n",
       "   Data: creg\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  2366.7   2443.3  -1171.4   2342.7     4372 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-25.3046   0.0718   0.1413   0.2492   6.7608 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.1384   0.372   \n",
       " questionID (Intercept) 2.5943   1.611   \n",
       "Number of obs: 4384, groups:  studID, 384; questionID, 163\n",
       "\n",
       "Fixed effects:\n",
       "                                     Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                           1.08292    0.20115   5.384 7.30e-08 ***\n",
       "alnorm                               -0.28155    0.08361  -3.367 0.000759 ***\n",
       "simCatmid                            -0.78207    0.15009  -5.211 1.88e-07 ***\n",
       "simCathigh                           -0.88607    0.16897  -5.244 1.57e-07 ***\n",
       "diffLevelreorganization              -0.09004    0.48851  -0.184 0.853767    \n",
       "diffLevelinference                    1.20356    0.63236   1.903 0.057004 .  \n",
       "correct_fac1                          4.03759    0.19945  20.243  < 2e-16 ***\n",
       "scale(ans_homog)                     -0.28557    0.15694  -1.820 0.068820 .  \n",
       "diffLevelreorganization:correct_fac1  0.18632    0.40653   0.458 0.646724    \n",
       "diffLevelinference:correct_fac1      -2.90088    0.50645  -5.728 1.02e-08 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg dffLvlr dffLvln crrc_1 scl(_)\n",
       "alnorm       0.025                                                   \n",
       "simCatmid   -0.306  0.098                                            \n",
       "simCathigh  -0.256  0.304  0.538                                     \n",
       "dffLvlrrgnz -0.338  0.031  0.011  0.025                              \n",
       "dffLvlnfrnc -0.253  0.059 -0.016 -0.001  0.138                       \n",
       "correct_fc1 -0.096 -0.259 -0.308 -0.389  0.083   0.062               \n",
       "scl(ns_hmg) -0.053  0.082  0.061  0.076  0.190   0.136  -0.188       \n",
       "dffLvlrr:_1  0.134 -0.007 -0.015  0.008 -0.247  -0.032  -0.345  0.049\n",
       "dffLvlnf:_1  0.065  0.056  0.057  0.057 -0.035  -0.391  -0.342  0.059\n",
       "            dffLvlr:_1\n",
       "alnorm                \n",
       "simCatmid             \n",
       "simCathigh            \n",
       "dffLvlrrgnz           \n",
       "dffLvlnfrnc           \n",
       "correct_fc1           \n",
       "scl(ns_hmg)           \n",
       "dffLvlrr:_1           \n",
       "dffLvlnf:_1  0.135    "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(cregmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random-only model for CREG: intercept plus random intercepts for question and student\n",
    "cregmodel_empty <- bglmer(\n",
    "    agree ~ (1|questionID) + (1|studID),\n",
    "    data = creg,\n",
    "    family=\"binomial\",\n",
    "    control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# singularity check for the CREG random-only model (this cell previously tested the CREE model)\n",
    "isSingular(cregmodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 1.8581\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID) + (1 | studID)\n",
       "   Data: creg\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  3191.8   3211.0  -1592.9   3185.8     4381 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-5.1270  0.1836  0.2604  0.3935  1.3991 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.1992   0.4463  \n",
       " questionID (Intercept) 1.4547   1.2061  \n",
       "Number of obs: 4384, groups:  studID, 384; questionID, 163\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)   2.4827     0.1357    18.3   <2e-16 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(cregmodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>cregmodel_empty</th><td> 3           </td><td>3191.819     </td><td>3210.976     </td><td>-1592.909    </td><td>3185.819     </td><td>      NA     </td><td>NA           </td><td>           NA</td></tr>\n",
       "\t<tr><th scope=row>cregmodel</th><td>12           </td><td>2366.712     </td><td>2443.341     </td><td>-1171.356    </td><td>2342.712     </td><td>843.1065     </td><td> 9           </td><td>1.113548e-175</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tcregmodel\\_empty &  3            & 3191.819      & 3210.976      & -1592.909     & 3185.819      &       NA      & NA            &            NA\\\\\n",
       "\tcregmodel & 12            & 2366.712      & 2443.341      & -1171.356     & 2342.712      & 843.1065      &  9            & 1.113548e-175\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| cregmodel_empty |  3            | 3191.819      | 3210.976      | -1592.909     | 3185.819      |       NA      | NA            |            NA | \n",
       "| cregmodel | 12            | 2366.712      | 2443.341      | -1171.356     | 2342.712      | 843.1065      |  9            | 1.113548e-175 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                Df AIC      BIC      logLik    deviance Chisq    Chi Df\n",
       "cregmodel_empty  3 3191.819 3210.976 -1592.909 3185.819       NA NA    \n",
       "cregmodel       12 2366.712 2443.341 -1171.356 2342.712 843.1065  9    \n",
       "                Pr(>Chisq)   \n",
       "cregmodel_empty            NA\n",
       "cregmodel       1.113548e-175"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(cregmodel_empty, cregmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.158705</td><td>1       </td><td>1.076431</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.318893</td><td>2       </td><td>1.071649</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>1.361678</td><td>2       </td><td>1.080236</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.657858</td><td>1       </td><td>1.287578</td></tr>\n",
       "\t<tr><th scope=row>scale(ans_homog)</th><td>1.106766</td><td>1       </td><td>1.052030</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>1.653861</td><td>2       </td><td>1.134031</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.158705 & 1        & 1.076431\\\\\n",
       "\tsimCat & 1.318893 & 2        & 1.071649\\\\\n",
       "\tdiffLevel & 1.361678 & 2        & 1.080236\\\\\n",
       "\tcorrect\\_fac & 1.657858 & 1        & 1.287578\\\\\n",
       "\tscale(ans\\_homog) & 1.106766 & 1        & 1.052030\\\\\n",
       "\tdiffLevel:correct\\_fac & 1.653861 & 2        & 1.134031\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 1.158705 | 1        | 1.076431 | \n",
       "| simCat | 1.318893 | 2        | 1.071649 | \n",
       "| diffLevel | 1.361678 | 2        | 1.080236 | \n",
       "| correct_fac | 1.657858 | 1        | 1.287578 | \n",
       "| scale(ans_homog) | 1.106766 | 1        | 1.052030 | \n",
       "| diffLevel:correct_fac | 1.653861 | 2        | 1.134031 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                1.158705 1  1.076431       \n",
       "simCat                1.318893 2  1.071649       \n",
       "diffLevel             1.361678 2  1.080236       \n",
       "correct_fac           1.657858 1  1.287578       \n",
       "scale(ans_homog)      1.106766 1  1.052030       \n",
       "diffLevel:correct_fac 1.653861 2  1.134031       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "car::vif(cregmodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No collinearity problems: all scaled values `GVIF^(1/(2*Df))` are below 1.3 (raw GVIF is at most 1.66)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ASAP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   questionID         studID      language    correctness         anno1       \n",
       " asap_3 :1891   asap_1   :    1   en:17207   Min.   :0.0000   Min.   :0.0000  \n",
       " asap_7 :1799   asap_10  :    1              1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " asap_8 :1799   asap_100 :    1              Median :0.5000   Median :0.5000  \n",
       " asap_9 :1798   asap_1000:    1              Mean   :0.4095   Mean   :0.4095  \n",
       " asap_6 :1797   asap_1001:    1              3rd Qu.:0.6667   3rd Qu.:0.6667  \n",
       " asap_5 :1795   asap_1002:    1              Max.   :1.0000   Max.   :1.0000  \n",
       " (Other):6328   (Other)  :17201                                               \n",
       "     anno2         answerLength    questionLength         type     \n",
       " Min.   :0.0000   Min.   :   1.0   Min.   :  94.0   content :8182  \n",
       " 1st Qu.:0.0000   1st Qu.: 128.0   1st Qu.: 111.0   language:9025  \n",
       " Median :0.5000   Median : 218.0   Median : 153.0                  \n",
       " Mean   :0.4086   Mean   : 236.8   Mean   : 361.9                  \n",
       " 3rd Qu.:0.6667   3rd Qu.: 319.0   3rd Qu.: 727.0                  \n",
       " Max.   :1.0000   Max.   :1819.0   Max.   :1392.0                  \n",
       "                                                                   \n",
       "          diffLevel         Sim           ans_homog       collection       \n",
       " remember      :3592   Min.   :0.0000   Min.   :0.1848   Length:17207      \n",
       " understand    :3312   1st Qu.:0.2425   1st Qu.:0.2622   Class :character  \n",
       " reorganization:3629   Median :0.3079   Median :0.3179   Mode  :character  \n",
       " inference     :5396   Mean   :0.3108   Mean   :0.3108                     \n",
       " several       :1278   3rd Qu.:0.3794   3rd Qu.:0.3505                     \n",
       "                       Max.   :0.9134   Max.   :0.4424                     \n",
       "                                                                           "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# load the ASAP training data (tab-separated), prefix the IDs with the corpus name,\n",
    "# and fix the order of the difficulty levels\n",
    "asap <- data.table(read.delim(\"data/ASAP_train.txt\"))\n",
    "asap$studID <- as.factor(paste(\"asap\", asap$studID, sep=\"_\"))\n",
    "asap$questionID <- as.factor(paste(\"asap\", asap$questionID, sep=\"_\"))\n",
    "asap$diffLevel <- factor(asap$diffLevel,\n",
    "                         levels=c(\"remember\",\"understand\",\"reorganization\",\"inference\",\"several\"))\n",
    "asap$collection <- \"standardized\"\n",
    "summary(asap)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. \n",
       " 0.0000  1.0000  1.0000  0.8985  1.0000  1.0000 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "    0     1 \n",
       " 1747 15460 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# agreement indicator: 1 when the two annotator scores differ by less than 0.5, else 0\n",
    "asap$agree <- with(asap, as.integer(abs(anno1 - anno2) < 0.5))\n",
    "asap$corpus <- \"asap\"\n",
    "# ASAP is the reference corpus for the weights, so this ratio is 1\n",
    "asap$weights <- round(nrow(asap) / nrow(asap))\n",
    "\n",
    "summary(asap$agree)\n",
    "table(as.factor(asap$agree))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>0</dt>\n",
       "\t\t<dd>8022</dd>\n",
       "\t<dt>1</dt>\n",
       "\t\t<dd>9185</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0] 8022\n",
       "\\item[1] 9185\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0\n",
       ":   80221\n",
       ":   9185\n",
       "\n"
      ],
      "text/plain": [
       "   0    1 \n",
       "8022 9185 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# binarise correctness: values of at least 0.5 become level '1', all others '0'\n",
    "asap$correct_fac <- as.factor(ifelse(asap$correctness >= 0.5, '1', '0'))\n",
    "summary(asap$correct_fac)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split ASAP by assessment type: content (CA) vs. language (LA)\n",
    "\n",
    "asap_ca <- asap[type == \"content\"]\n",
    "asap_la <- asap[type == \"language\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ASAP language assessment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   questionID          studID     language   correctness         anno1       \n",
       " asap_3 :1891   asap_16954:   1   en:9025   Min.   :0.0000   Min.   :0.0000  \n",
       " asap_7 :1799   asap_16955:   1             1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " asap_8 :1799   asap_16956:   1             Median :0.5000   Median :0.5000  \n",
       " asap_9 :1798   asap_16957:   1             Mean   :0.4643   Mean   :0.4643  \n",
       " asap_4 :1738   asap_16958:   1             3rd Qu.:1.0000   3rd Qu.:1.0000  \n",
       " asap_1 :   0   asap_16959:   1             Max.   :1.0000   Max.   :1.0000  \n",
       " (Other):   0   (Other)   :9019                                              \n",
       "     anno2         answerLength    questionLength        type     \n",
       " Min.   :0.0000   Min.   :   3.0   Min.   : 94.0   content :   0  \n",
       " 1st Qu.:0.0000   1st Qu.: 162.0   1st Qu.:132.0   language:9025  \n",
       " Median :0.5000   Median : 243.0   Median :153.0                  \n",
       " Mean   :0.4639   Mean   : 262.6   Mean   :146.3                  \n",
       " 3rd Qu.:1.0000   3rd Qu.: 334.0   3rd Qu.:165.0                  \n",
       " Max.   :1.0000   Max.   :1819.0   Max.   :186.0                  \n",
       "                                                                  \n",
       "          diffLevel         Sim           ans_homog       collection       \n",
       " remember      :   0   Min.   :0.0000   Min.   :0.2622   Length:9025       \n",
       " understand    :   0   1st Qu.:0.2691   1st Qu.:0.3004   Class :character  \n",
       " reorganization:3629   Median :0.3188   Median :0.3179   Mode  :character  \n",
       " inference     :5396   Mean   :0.3326   Mean   :0.3326                     \n",
       " several       :   0   3rd Qu.:0.3867   3rd Qu.:0.3345                     \n",
       "                       Max.   :0.8121   Max.   :0.4424                     \n",
       "                                                                           \n",
       "     agree           corpus             weights  correct_fac\n",
       " Min.   :0.0000   Length:9025        Min.   :1   0:3035     \n",
       " 1st Qu.:1.0000   Class :character   1st Qu.:1   1:5990     \n",
       " Median :1.0000   Mode  :character   Median :1              \n",
       " Mean   :0.8298                      Mean   :1              \n",
       " 3rd Qu.:1.0000                      3rd Qu.:1              \n",
       " Max.   :1.0000                      Max.   :1              \n",
       "                                                            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "   0    1 \n",
       "1536 7489 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# summaries for both parts of ASAP\n",
    "summary(asap_la)\n",
    "table(as.factor(as.integer(abs(asap_la$anno1-asap_la$anno2) < 0.5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "asap_la$alnorm <- scale(log(asap_la$answerLength +1))\n",
    "asap_la$qlnorm <- scale(log(asap_la$questionLength +1))\n",
    "#hist(asap_la$alnorm)\n",
    "#hist(asap_la$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "asap_la$relsim <- asap_la$Sim - asap_la$ans_homog\n",
    "#pg$relsim\n",
    "per_q_sd <- asap_la[, sd(relsim), by=questionID]\n",
    "qid_idx <- which(colnames(asap_la) == \"questionID\")\n",
    "rs_idx <- which(colnames(asap_la) == \"relsim\")\n",
    "asap_la$simdevnorm <- apply(asap_la, 1, function(row) {\n",
    "    relsim <- as.numeric(row[rs_idx])\n",
    "    qid <- row[qid_idx]\n",
    "    relsim / per_q_sd[questionID == qid]$V1\n",
    "    })\n",
    "rm(qid_idx,rs_idx,per_q_sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>2854</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>3625</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>2546</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 2854\n",
       "\\item[mid] 3625\n",
       "\\item[high] 2546\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   2854mid\n",
       ":   3625high\n",
       ":   2546\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       "2854 3625 2546 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "asap_la$simCat <- \"mid\"\n",
    "asap_la[asap_la$simdevnorm >= 0.5,]$simCat <- \"high\"\n",
    "asap_la[asap_la$simdevnorm <= -0.5,]$simCat <- \"low\"\n",
    "asap_la$simCat <- factor(asap_la$simCat, levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(asap_la$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LMER for ASAP-la\n",
    "\n",
    "asap_lamodel <- bglmer(agree ~  \n",
    "                     alnorm  +\n",
    "                     simCat +\n",
    "                     diffLevel * correct_fac + \n",
    "                     scale(ans_homog) +\n",
    "                     (1|questionID),\n",
    "                  #   (1|studID), # just one observation per student\n",
    "                 data = asap_la,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))\n",
    "\n",
    "# removed interaction because of collinearity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "isSingular(asap_lamodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 1.7422\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: \n",
       "agree ~ alnorm + simCat + diffLevel * correct_fac + scale(ans_homog) +  \n",
       "    (1 | questionID)\n",
       "   Data: asap_la\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  7821.7   7885.6  -3901.8   7803.7     9016 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-7.5377  0.1971  0.4442  0.5351  0.6990 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 0.313    0.5595  \n",
       "Number of obs: 9025, groups:  questionID, 5\n",
       "\n",
       "Fixed effects:\n",
       "                                Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                      1.64877    0.54441   3.029  0.00246 ** \n",
       "alnorm                          -0.22951    0.04563  -5.029 4.92e-07 ***\n",
       "simCatmid                       -0.13935    0.07301  -1.909  0.05629 .  \n",
       "simCathigh                      -0.12131    0.09699  -1.251  0.21105    \n",
       "diffLevelinference               0.56646    0.80947   0.700  0.48406    \n",
       "correct_fac1                     0.05307    0.08834   0.601  0.54803    \n",
       "scale(ans_homog)                -0.37410    0.40061  -0.934  0.35039    \n",
       "diffLevelinference:correct_fac1 -0.45563    0.13603  -3.349  0.00081 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg dffLvl crrc_1 scl(_)\n",
       "alnorm      -0.039                                          \n",
       "simCatmid   -0.081  0.401                                   \n",
       "simCathigh  -0.081  0.621  0.602                            \n",
       "dffLvlnfrnc -0.876  0.010  0.000  0.005                     \n",
       "correct_fc1 -0.105 -0.108  0.011  0.035  0.062              \n",
       "scl(ns_hmg) -0.667 -0.022 -0.012 -0.018  0.762 -0.011       \n",
       "dffLvlnf:_1  0.080 -0.178 -0.078 -0.130 -0.119 -0.615  0.003"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(asap_lamodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 0.4246\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID)\n",
       "   Data: asap_la\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  7884.6   7898.8  -3940.3   7880.6     9023 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-4.7992  0.2084  0.4365  0.5276  0.5598 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 0.7535   0.868   \n",
       "Number of obs: 9025, groups:  questionID, 5\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)    1.738      0.384   4.525 6.04e-06 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "asap_lamodel_empty <- bglmer(agree ~\n",
    "                   (1|questionID),               \n",
    "                 data = asap_la,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))\n",
    "summary(asap_lamodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>asap_lamodel_empty</th><td>2           </td><td>7884.617    </td><td>7898.833    </td><td>-3940.309   </td><td>7880.617    </td><td>      NA    </td><td>NA          </td><td>          NA</td></tr>\n",
       "\t<tr><th scope=row>asap_lamodel</th><td>9           </td><td>7821.668    </td><td>7885.637    </td><td>-3901.834   </td><td>7803.668    </td><td>76.94984    </td><td> 7          </td><td>5.758479e-14</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tasap\\_lamodel\\_empty & 2            & 7884.617     & 7898.833     & -3940.309    & 7880.617     &       NA     & NA           &           NA\\\\\n",
       "\tasap\\_lamodel & 9            & 7821.668     & 7885.637     & -3901.834    & 7803.668     & 76.94984     &  7           & 5.758479e-14\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| asap_lamodel_empty | 2            | 7884.617     | 7898.833     | -3940.309    | 7880.617     |       NA     | NA           |           NA | \n",
       "| asap_lamodel | 9            | 7821.668     | 7885.637     | -3901.834    | 7803.668     | 76.94984     |  7           | 5.758479e-14 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                   Df AIC      BIC      logLik    deviance Chisq    Chi Df\n",
       "asap_lamodel_empty 2  7884.617 7898.833 -3940.309 7880.617       NA NA    \n",
       "asap_lamodel       9  7821.668 7885.637 -3901.834 7803.668 76.94984  7    \n",
       "                   Pr(>Chisq)  \n",
       "asap_lamodel_empty           NA\n",
       "asap_lamodel       5.758479e-14"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(asap_lamodel_empty, asap_lamodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.831364</td><td>1       </td><td>1.353279</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.667818</td><td>2       </td><td>1.136416</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>2.475594</td><td>1       </td><td>1.573402</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.786301</td><td>1       </td><td>1.336526</td></tr>\n",
       "\t<tr><th scope=row>scale(ans_homog)</th><td>2.441515</td><td>1       </td><td>1.562535</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>1.830690</td><td>1       </td><td>1.353030</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.831364 & 1        & 1.353279\\\\\n",
       "\tsimCat & 1.667818 & 2        & 1.136416\\\\\n",
       "\tdiffLevel & 2.475594 & 1        & 1.573402\\\\\n",
       "\tcorrect\\_fac & 1.786301 & 1        & 1.336526\\\\\n",
       "\tscale(ans\\_homog) & 2.441515 & 1        & 1.562535\\\\\n",
       "\tdiffLevel:correct\\_fac & 1.830690 & 1        & 1.353030\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 1.831364 | 1        | 1.353279 | \n",
       "| simCat | 1.667818 | 2        | 1.136416 | \n",
       "| diffLevel | 2.475594 | 1        | 1.573402 | \n",
       "| correct_fac | 1.786301 | 1        | 1.336526 | \n",
       "| scale(ans_homog) | 2.441515 | 1        | 1.562535 | \n",
       "| diffLevel:correct_fac | 1.830690 | 1        | 1.353030 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                1.831364 1  1.353279       \n",
       "simCat                1.667818 2  1.136416       \n",
       "diffLevel             2.475594 1  1.573402       \n",
       "correct_fac           1.786301 1  1.336526       \n",
       "scale(ans_homog)      2.441515 1  1.562535       \n",
       "diffLevel:correct_fac 1.830690 1  1.353030       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "car::vif(asap_lamodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Collinearity ok, highest VIF around 1.6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ASAP content assessment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   questionID         studID     language   correctness         anno1       \n",
       " asap_6 :1797   asap_1   :   1   en:8182   Min.   :0.0000   Min.   :0.0000  \n",
       " asap_5 :1795   asap_10  :   1             1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " asap_1 :1672   asap_100 :   1             Median :0.3333   Median :0.3333  \n",
       " asap_10:1640   asap_1000:   1             Mean   :0.3491   Mean   :0.3491  \n",
       " asap_2 :1278   asap_1001:   1             3rd Qu.:0.6667   3rd Qu.:0.6667  \n",
       " asap_3 :   0   asap_1002:   1             Max.   :1.0000   Max.   :1.0000  \n",
       " (Other):   0   (Other)  :8176                                              \n",
       "     anno2         answerLength    questionLength         type     \n",
       " Min.   :0.0000   Min.   :   1.0   Min.   : 105.0   content :8182  \n",
       " 1st Qu.:0.0000   1st Qu.:  96.0   1st Qu.: 111.0   language:   0  \n",
       " Median :0.3333   Median : 183.0   Median : 727.0                  \n",
       " Mean   :0.3475   Mean   : 208.3   Mean   : 599.8                  \n",
       " 3rd Qu.:0.6667   3rd Qu.: 295.0   3rd Qu.: 799.0                  \n",
       " Max.   :1.0000   Max.   :1477.0   Max.   :1392.0                  \n",
       "                                                                   \n",
       "          diffLevel         Sim           ans_homog       collection       \n",
       " remember      :3592   Min.   :0.0000   Min.   :0.1848   Length:8182       \n",
       " understand    :3312   1st Qu.:0.2024   1st Qu.:0.2101   Class :character  \n",
       " reorganization:   0   Median :0.2873   Median :0.3362   Mode  :character  \n",
       " inference     :   0   Mean   :0.2868   Mean   :0.2868                     \n",
       " several       :1278   3rd Qu.:0.3697   3rd Qu.:0.3505                     \n",
       "                       Max.   :0.9134   Max.   :0.3914                     \n",
       "                                                                           \n",
       "     agree           corpus             weights  correct_fac\n",
       " Min.   :0.0000   Length:8182        Min.   :1   0:4987     \n",
       " 1st Qu.:1.0000   Class :character   1st Qu.:1   1:3195     \n",
       " Median :1.0000   Mode  :character   Median :1              \n",
       " Mean   :0.9742                      Mean   :1              \n",
       " 3rd Qu.:1.0000                      3rd Qu.:1              \n",
       " Max.   :1.0000                      Max.   :1              \n",
       "                                                            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "   0    1 \n",
       " 211 7971 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(asap_ca)\n",
    "table(as.factor(as.integer(abs(asap_ca$anno1-asap_ca$anno2) < 0.5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "asap_ca$alnorm <- scale(log(asap_ca$answerLength +1))\n",
    "asap_ca$qlnorm <- scale(log(asap_ca$questionLength +1))\n",
    "#hist(asap_ca$alnorm)\n",
    "#hist(asap_ca$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "asap_ca$relsim <- asap_ca$Sim - asap_ca$ans_homog\n",
    "#pg$relsim\n",
    "per_q_sd <- asap_ca[, sd(relsim), by=questionID]\n",
    "qid_idx <- which(colnames(asap_ca) == \"questionID\")\n",
    "rs_idx <- which(colnames(asap_ca) == \"relsim\")\n",
    "asap_ca$simdevnorm <- apply(asap_ca, 1, function(row) {\n",
    "    relsim <- as.numeric(row[rs_idx])\n",
    "    qid <- row[qid_idx]\n",
    "    relsim / per_q_sd[questionID == qid]$V1\n",
    "    })\n",
    "rm(qid_idx,rs_idx,per_q_sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>2457</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>3369</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>2356</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 2457\n",
       "\\item[mid] 3369\n",
       "\\item[high] 2356\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   2457mid\n",
       ":   3369high\n",
       ":   2356\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       "2457 3369 2356 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "asap_ca$simCat <- \"mid\"\n",
    "asap_ca[asap_ca$simdevnorm >= 0.5,]$simCat <- \"high\"\n",
    "asap_ca[asap_ca$simdevnorm <= -0.5,]$simCat <- \"low\"\n",
    "asap_ca$simCat <- factor(asap_ca$simCat, levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(asap_ca$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "asap_camodel <- bglmer(agree ~  \n",
    "                     alnorm  +\n",
    "                     simCat +\n",
    "                     diffLevel +\n",
    "                     correct_fac +\n",
    "#                     scale(ans_homog) +\n",
    "                     (1|questionID),\n",
    "                  #   (1|studID), # just one observation per student\n",
    "                 data = asap_ca,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))\n",
    "\n",
    "# diffLevel removed for multicollinearity reasons\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "isSingular(asap_camodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : -1.59\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ alnorm + simCat + diffLevel + correct_fac + (1 | questionID)\n",
       "   Data: asap_ca\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  1459.7   1515.8   -721.8   1443.7     8174 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-21.8243   0.0265   0.0589   0.0975   0.5414 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 2.886    1.699   \n",
       "Number of obs: 8182, groups:  questionID, 5\n",
       "\n",
       "Fixed effects:\n",
       "                    Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)           7.7703     1.4824   5.242 1.59e-07 ***\n",
       "alnorm               -0.2453     0.1472  -1.667  0.09555 .  \n",
       "simCatmid            -0.1707     0.1979  -0.862  0.38849    \n",
       "simCathigh            0.3174     0.2792   1.137  0.25559    \n",
       "diffLevelunderstand  -4.5438     1.9080  -2.382  0.01724 *  \n",
       "diffLevelseveral     -2.7800     2.2823  -1.218  0.22320    \n",
       "correct_fac1          0.5809     0.1926   3.016  0.00256 ** \n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg dffLvln dffLvls\n",
       "alnorm      -0.050                                     \n",
       "simCatmid   -0.090  0.548                              \n",
       "simCathigh  -0.072  0.701  0.649                       \n",
       "dffLvlndrst -0.767 -0.010  0.000 -0.012                \n",
       "diffLvlsvrl -0.642 -0.042 -0.021 -0.033  0.501         \n",
       "correct_fc1  0.015 -0.400 -0.152 -0.173 -0.048  -0.020 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(asap_camodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : -3.2282\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID)\n",
       "   Data: asap_ca\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  1474.1   1488.1   -735.0   1470.1     8180 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-24.1089   0.0415   0.0790   0.0811   0.3605 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 8.603    2.933   \n",
       "Number of obs: 8182, groups:  questionID, 5\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)    5.622      1.351    4.16 3.18e-05 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "asap_camodel_empty <- bglmer(agree ~\n",
    "                   (1|questionID),               \n",
    "                 data = asap_ca,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))\n",
    "summary(asap_camodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>asap_camodel_empty</th><td>2           </td><td>1474.088    </td><td>1488.107    </td><td>-735.0438   </td><td>1470.088    </td><td>      NA    </td><td>NA          </td><td>          NA</td></tr>\n",
       "\t<tr><th scope=row>asap_camodel</th><td>8           </td><td>1459.687    </td><td>1515.765    </td><td>-721.8435   </td><td>1443.687    </td><td>26.40056    </td><td> 6          </td><td>0.0001874576</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tasap\\_camodel\\_empty & 2            & 1474.088     & 1488.107     & -735.0438    & 1470.088     &       NA     & NA           &           NA\\\\\n",
       "\tasap\\_camodel & 8            & 1459.687     & 1515.765     & -721.8435    & 1443.687     & 26.40056     &  6           & 0.0001874576\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| asap_camodel_empty | 2            | 1474.088     | 1488.107     | -735.0438    | 1470.088     |       NA     | NA           |           NA | \n",
       "| asap_camodel | 8            | 1459.687     | 1515.765     | -721.8435    | 1443.687     | 26.40056     |  6           | 0.0001874576 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                   Df AIC      BIC      logLik    deviance Chisq    Chi Df\n",
       "asap_camodel_empty 2  1474.088 1488.107 -735.0438 1470.088       NA NA    \n",
       "asap_camodel       8  1459.687 1515.765 -721.8435 1443.687 26.40056  6    \n",
       "                   Pr(>Chisq)  \n",
       "asap_camodel_empty           NA\n",
       "asap_camodel       0.0001874576"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(asap_camodel_empty, asap_camodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>2.404266</td><td>1       </td><td>1.550569</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>2.084907</td><td>2       </td><td>1.201633</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>1.005492</td><td>2       </td><td>1.001370</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.227891</td><td>1       </td><td>1.108103</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 2.404266 & 1        & 1.550569\\\\\n",
       "\tsimCat & 2.084907 & 2        & 1.201633\\\\\n",
       "\tdiffLevel & 1.005492 & 2        & 1.001370\\\\\n",
       "\tcorrect\\_fac & 1.227891 & 1        & 1.108103\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|\n",
       "| alnorm | 2.404266 | 1        | 1.550569 | \n",
       "| simCat | 2.084907 | 2        | 1.201633 | \n",
       "| diffLevel | 1.005492 | 2        | 1.001370 | \n",
       "| correct_fac | 1.227891 | 1        | 1.108103 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "            GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm      2.404266 1  1.550569       \n",
       "simCat      2.084907 2  1.201633       \n",
       "diffLevel   1.005492 2  1.001370       \n",
       "correct_fac 1.227891 1  1.108103       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "car::vif(asap_camodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Rather excessive collearity btw diffLevel and ans_homog (VIFs ~70). Removed diffLevel -> VIFs < 1.6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ASAP-DE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "      questionID          studID    language  correctness         anno1       \n",
       " asap_de_1 :301   asap_de_1  :  1   de:903   Min.   :0.0000   Min.   :0.0000  \n",
       " asap_de_10:301   asap_de_10 :  1            1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " asap_de_2 :301   asap_de_100:  1            Median :0.3333   Median :0.3333  \n",
       "                  asap_de_101:  1            Mean   :0.3619   Mean   :0.3619  \n",
       "                  asap_de_102:  1            3rd Qu.:0.6667   3rd Qu.:0.6667  \n",
       "                  asap_de_103:  1            Max.   :1.0000   Max.   :1.0000  \n",
       "                  (Other)    :897                                             \n",
       "     anno2         answerLength    questionLength      type    \n",
       " Min.   :0.0000   Min.   :   1.0   Min.   : 797   content:903  \n",
       " 1st Qu.:0.0000   1st Qu.:  86.0   1st Qu.: 797                \n",
       " Median :0.5000   Median : 145.0   Median : 888                \n",
       " Mean   :0.4406   Mean   : 174.9   Mean   :1138                \n",
       " 3rd Qu.:0.6667   3rd Qu.: 227.5   3rd Qu.:1728                \n",
       " Max.   :1.0000   Max.   :1338.0   Max.   :1728                \n",
       "                                                               \n",
       "      diffLevel        Sim           ans_homog       collection       \n",
       " understand:602   Min.   :0.0000   Min.   :0.1773   Length:903        \n",
       " several   :301   1st Qu.:0.1708   1st Qu.:0.1773   Class :character  \n",
       "                  Median :0.2093   Median :0.2024   Mode  :character  \n",
       "                  Mean   :0.2219   Mean   :0.2219                     \n",
       "                  3rd Qu.:0.2620   3rd Qu.:0.2861                     \n",
       "                  Max.   :0.5248   Max.   :0.2861                     \n",
       "                                                                      "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Read the tab-separated ASAP-DE file into a data.table.\n",
    "asap_de <- data.table(read.csv(\"data/ASAP_DE.txt\",sep=\"\\t\"))\n",
    "# Prefix both ID columns with the corpus name and store them as factors.\n",
    "asap_de$questionID <- factor(paste(\"asap_de\",asap_de$questionID,sep=\"_\"))\n",
    "asap_de$studID <- factor(paste(\"asap_de\",asap_de$studID,sep=\"_\"))\n",
    "# Tag the corpus as a research collection.\n",
    "asap_de$collection <- \"research\"\n",
    "# Fix the level order of the difficulty factor; \"understand\" is the reference level.\n",
    "asap_de$diffLevel <- factor(asap_de$diffLevel, levels=c(\"understand\",\"several\"))\n",
    "summary(asap_de)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. \n",
       " 0.0000  1.0000  1.0000  0.8328  1.0000  1.0000 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "  0   1 \n",
       "151 752 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Binary agreement: 1 when the two annotators' scores differ by less than 0.5, else 0.\n",
    "asap_de$agree <- with(asap_de, as.integer(abs(anno1 - anno2) < 0.5))\n",
    "summary(asap_de$agree)\n",
    "table(as.factor(asap_de$agree))\n",
    "asap_de$corpus <- \"asap_de\"\n",
    "# Rounded size ratio relative to the `asap` table, which must already exist in the session.\n",
    "asap_de$weights <- round(nrow(asap) / nrow(asap_de))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>0</dt>\n",
       "\t\t<dd>501</dd>\n",
       "\t<dt>1</dt>\n",
       "\t\t<dd>402</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0] 501\n",
       "\\item[1] 402\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0\n",
       ":   5011\n",
       ":   402\n",
       "\n"
      ],
      "text/plain": [
       "  0   1 \n",
       "501 402 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Binarize correctness: scores of at least 0.5 become '1', everything else '0'.\n",
    "asap_de$correct_fac <- factor(ifelse(asap_de$correctness >= 0.5, '1', '0'))\n",
    "summary(asap_de$correct_fac)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Standardized (z-scored) log answer and question lengths; the +1 keeps log() finite for a length of 0.\n",
    "# Note: scale() returns a one-column matrix carrying the centering/scaling attributes.\n",
    "asap_de$alnorm <- scale(log(asap_de$answerLength +1))\n",
    "asap_de$qlnorm <- scale(log(asap_de$questionLength +1))\n",
    "# Optional visual check of the two distributions:\n",
    "#hist(asap_de$alnorm)\n",
    "#hist(asap_de$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similarity of each answer relative to the answer homogeneity value (Sim - ans_homog).\n",
    "asap_de$relsim <- asap_de$Sim - asap_de$ans_homog\n",
    "# Normalize by the per-question standard deviation of relsim.\n",
    "# ave() broadcasts each question's sd back onto its rows, so the computation stays\n",
    "# numeric and vectorized. Iterating over rows with apply() would coerce the table to a\n",
    "# character matrix, sending relsim through formatted text (precision loss) and costing\n",
    "# one table lookup per row.\n",
    "asap_de$simdevnorm <- asap_de$relsim / ave(asap_de$relsim, asap_de$questionID, FUN = sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>272</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>370</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>261</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 272\n",
       "\\item[mid] 370\n",
       "\\item[high] 261\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   272mid\n",
       ":   370high\n",
       ":   261\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       " 272  370  261 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Three-way similarity category from simdevnorm: <= -0.5 is \"low\", >= 0.5 is \"high\",\n",
    "# everything in between stays \"mid\".\n",
    "asap_de$simCat <- \"mid\"\n",
    "asap_de$simCat[which(asap_de$simdevnorm <= -0.5)] <- \"low\"\n",
    "asap_de$simCat[which(asap_de$simdevnorm >= 0.5)] <- \"high\"\n",
    "# Ordered level list so that \"low\" is the reference level.\n",
    "asap_de$simCat <- factor(asap_de$simCat, levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(asap_de$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic mixed model for ASAP-DE, fitted with blme::bglmer (binomial family).\n",
    "# The diffLevel x correct_fac interaction is kept in this model; multicollinearity is\n",
    "# checked with car::vif() in a later cell.\n",
    "\n",
    "asapdemodel <- bglmer(agree ~  \n",
    "                     alnorm  +\n",
    "                     simCat + \n",
    "                     diffLevel  * correct_fac + \n",
    "                     scale(ans_homog) +\n",
    "                     (1|questionID),\n",
    "                  #   (1|studID), # just one observation per student\n",
    "                 data = asap_de,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Check whether the ASAP-DE fit is singular\n",
    "isSingular(asapdemodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 4.8968\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: \n",
       "agree ~ alnorm + simCat + diffLevel * correct_fac + scale(ans_homog) +  \n",
       "    (1 | questionID)\n",
       "   Data: asap_de\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "   668.8    712.1   -325.4    650.8      894 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-5.7349  0.1223  0.2690  0.3659  1.7924 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 0.03821  0.1955  \n",
       "Number of obs: 903, groups:  questionID, 3\n",
       "\n",
       "Fixed effects:\n",
       "                              Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                    1.65277    0.29505   5.602 2.12e-08 ***\n",
       "alnorm                        -0.34942    0.15567  -2.245  0.02479 *  \n",
       "simCatmid                     -0.08953    0.26267  -0.341  0.73322    \n",
       "simCathigh                    -0.18448    0.34441  -0.536  0.59221    \n",
       "diffLevelseveral               0.15339    0.43725   0.351  0.72573    \n",
       "correct_fac1                   1.49364    0.26460   5.645 1.65e-08 ***\n",
       "scale(ans_homog)              -1.33677    0.18792  -7.113 1.13e-12 ***\n",
       "diffLevelseveral:correct_fac1 -1.23871    0.47265  -2.621  0.00877 ** \n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg dffLvl crrc_1 scl(_)\n",
       "alnorm      -0.249                                          \n",
       "simCatmid   -0.531  0.410                                   \n",
       "simCathigh  -0.498  0.639  0.624                            \n",
       "diffLvlsvrl -0.472 -0.130 -0.057 -0.097                     \n",
       "correct_fc1 -0.226 -0.331 -0.029 -0.055  0.145              \n",
       "scl(ns_hmg) -0.319  0.018 -0.019 -0.060  0.413 -0.243       \n",
       "dffLvlsv:_1  0.174  0.006 -0.082 -0.067 -0.395 -0.497  0.130"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Coefficients, random-effect variances and fit statistics of the full ASAP-DE model\n",
    "summary(asapdemodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : -1.3594\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID)\n",
       "   Data: asap_de\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "   701.6    711.2   -348.8    697.6      901 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-4.8123  0.2078  0.3209  0.3209  0.7611 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " questionID (Intercept) 2.475    1.573   \n",
       "Number of obs: 903, groups:  questionID, 3\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)  \n",
       "(Intercept)   2.0048     0.9167   2.187   0.0287 *\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Baseline for ASAP-DE: only the intercept and a random intercept per question, no predictors.\n",
    "asapdemodel_empty <- bglmer(\n",
    "    agree ~ (1|questionID),\n",
    "    data = asap_de,\n",
    "    family = \"binomial\",\n",
    "    control = glmerControl(optimizer = \"bobyqa\")\n",
    ")\n",
    "summary(asapdemodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>asapdemodel_empty</th><td>2           </td><td>701.6230    </td><td>711.2344    </td><td>-348.8115   </td><td>697.6230    </td><td>      NA    </td><td>NA          </td><td>          NA</td></tr>\n",
       "\t<tr><th scope=row>asapdemodel</th><td>9           </td><td>668.8337    </td><td>712.0852    </td><td>-325.4168   </td><td>650.8337    </td><td>46.78931    </td><td> 7          </td><td>6.135857e-08</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tasapdemodel\\_empty & 2            & 701.6230     & 711.2344     & -348.8115    & 697.6230     &       NA     & NA           &           NA\\\\\n",
       "\tasapdemodel & 9            & 668.8337     & 712.0852     & -325.4168    & 650.8337     & 46.78931     &  7           & 6.135857e-08\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| asapdemodel_empty | 2            | 701.6230     | 711.2344     | -348.8115    | 697.6230     |       NA     | NA           |           NA | \n",
       "| asapdemodel | 9            | 668.8337     | 712.0852     | -325.4168    | 650.8337     | 46.78931     |  7           | 6.135857e-08 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                  Df AIC      BIC      logLik    deviance Chisq    Chi Df\n",
       "asapdemodel_empty 2  701.6230 711.2344 -348.8115 697.6230       NA NA    \n",
       "asapdemodel       9  668.8337 712.0852 -325.4168 650.8337 46.78931  7    \n",
       "                  Pr(>Chisq)  \n",
       "asapdemodel_empty           NA\n",
       "asapdemodel       6.135857e-08"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Likelihood-ratio comparison of the random-only model against the full ASAP-DE model\n",
    "anova(asapdemodel_empty, asapdemodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>2.067943</td><td>1       </td><td>1.438034</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.787117</td><td>2       </td><td>1.156214</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>1.651558</td><td>1       </td><td>1.285129</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.683008</td><td>1       </td><td>1.297308</td></tr>\n",
       "\t<tr><th scope=row>scale(ans_homog)</th><td>1.436808</td><td>1       </td><td>1.198669</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>1.741473</td><td>1       </td><td>1.319649</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 2.067943 & 1        & 1.438034\\\\\n",
       "\tsimCat & 1.787117 & 2        & 1.156214\\\\\n",
       "\tdiffLevel & 1.651558 & 1        & 1.285129\\\\\n",
       "\tcorrect\\_fac & 1.683008 & 1        & 1.297308\\\\\n",
       "\tscale(ans\\_homog) & 1.436808 & 1        & 1.198669\\\\\n",
       "\tdiffLevel:correct\\_fac & 1.741473 & 1        & 1.319649\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 2.067943 | 1        | 1.438034 | \n",
       "| simCat | 1.787117 | 2        | 1.156214 | \n",
       "| diffLevel | 1.651558 | 1        | 1.285129 | \n",
       "| correct_fac | 1.683008 | 1        | 1.297308 | \n",
       "| scale(ans_homog) | 1.436808 | 1        | 1.198669 | \n",
       "| diffLevel:correct_fac | 1.741473 | 1        | 1.319649 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                2.067943 1  1.438034       \n",
       "simCat                1.787117 2  1.156214       \n",
       "diffLevel             1.651558 1  1.285129       \n",
       "correct_fac           1.683008 1  1.297308       \n",
       "scale(ans_homog)      1.436808 1  1.198669       \n",
       "diffLevel:correct_fac 1.741473 1  1.319649       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Multicollinearity check: (generalized) variance inflation factors of the fixed effects\n",
    "car::vif(asapdemodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Collinearity fine"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CSSAG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    questionID        studID     language   correctness         anno1       \n",
       " cssag_2 :  82   cssag_1 :  24   de:1768   Min.   :0.0000   Min.   :0.0000  \n",
       " cssag_19:  76   cssag_14:  24             1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " cssag_21:  73   cssag_18:  24             Median :0.5000   Median :0.5000  \n",
       " cssag_1 :  71   cssag_34:  24             Mean   :0.5143   Mean   :0.4923  \n",
       " cssag_7 :  70   cssag_38:  24             3rd Qu.:1.0000   3rd Qu.:1.0000  \n",
       " cssag_28:  69   cssag_5 :  24             Max.   :1.0000   Max.   :1.0000  \n",
       " (Other) :1327   (Other) :1624                                              \n",
       "     anno2         answerLength   questionLength       type     \n",
       " Min.   :0.0000   Min.   :  3.0   Min.   : 32.0   content:1768  \n",
       " 1st Qu.:0.0000   1st Qu.: 74.0   1st Qu.: 61.0                 \n",
       " Median :0.5000   Median :128.0   Median :131.0                 \n",
       " Mean   :0.5182   Mean   :148.3   Mean   :139.4                 \n",
       " 3rd Qu.:1.0000   3rd Qu.:197.0   3rd Qu.:163.0                 \n",
       " Max.   :1.0000   Max.   :778.0   Max.   :560.0                 \n",
       "                                                                \n",
       "      diffLevel        Sim           ans_homog       collection       \n",
       " remember  :990   Min.   :0.0000   Min.   :0.1846   Length:1768       \n",
       " understand:601   1st Qu.:0.2254   1st Qu.:0.2690   Class :character  \n",
       " apply     :177   Median :0.2954   Median :0.2932   Mode  :character  \n",
       "                  Mean   :0.3104   Mean   :0.3104                     \n",
       "                  3rd Qu.:0.3714   3rd Qu.:0.3420                     \n",
       "                  Max.   :0.9420   Max.   :0.4537                     \n",
       "                                                                      "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Read the tab-separated CSSAG file into a data.table and drop rows with missing values.\n",
    "cssag <- na.omit(data.table(read.csv(\"data/CSSAG.txt\",sep=\"\\t\")))\n",
    "# Prefix both ID columns with the corpus name and store them as factors.\n",
    "cssag$questionID <- factor(paste(\"cssag\",cssag$questionID,sep=\"_\"))\n",
    "cssag$studID <- factor(paste(\"cssag\",cssag$studID,sep=\"_\"))\n",
    "# Tag the corpus as a classroom collection.\n",
    "cssag$collection <- \"classroom\"\n",
    "# Fix the level order of the difficulty factor; \"remember\" is the reference level.\n",
    "cssag$diffLevel <- factor(cssag$diffLevel, levels=c(\"remember\",\"understand\",\"apply\"))\n",
    "summary(cssag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. \n",
       " 0.0000  1.0000  1.0000  0.7721  1.0000  1.0000 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\n",
       "   0    1 \n",
       " 403 1365 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Binary agreement: 1 when the two annotators' scores differ by less than 0.5, else 0.\n",
    "cssag$agree <- with(cssag, as.integer(abs(anno1 - anno2) < 0.5))\n",
    "summary(cssag$agree)\n",
    "table(as.factor(cssag$agree))\n",
    "cssag$corpus <- \"cssag\"\n",
    "# Rounded size ratio relative to the `asap` table, which must already exist in the session.\n",
    "cssag$weights <- round(nrow(asap) / nrow(cssag))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>0</dt>\n",
       "\t\t<dd>1004</dd>\n",
       "\t<dt>1</dt>\n",
       "\t\t<dd>764</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[0] 1004\n",
       "\\item[1] 764\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "0\n",
       ":   10041\n",
       ":   764\n",
       "\n"
      ],
      "text/plain": [
       "   0    1 \n",
       "1004  764 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Binarize correctness: only scores strictly above 0.5 become '1', everything else '0'.\n",
    "# NOTE(review): the ASAP-DE cell uses '>=' for the same threshold; confirm the strict '>' is intended here.\n",
    "cssag$correct_fac <- factor(ifelse(cssag$correctness > 0.5, '1', '0'))\n",
    "summary(cssag$correct_fac)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Standardized (z-scored) log answer and question lengths; the +1 keeps log() finite for a length of 0.\n",
    "# Note: scale() returns a one-column matrix carrying the centering/scaling attributes.\n",
    "cssag$alnorm <- scale(log(cssag$answerLength +1))\n",
    "cssag$qlnorm <- scale(log(cssag$questionLength +1))\n",
    "# Optional visual check of the two distributions:\n",
    "#hist(cssag$alnorm)\n",
    "#hist(cssag$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similarity of each answer relative to the answer homogeneity value (Sim - ans_homog).\n",
    "cssag$relsim <- cssag$Sim - cssag$ans_homog\n",
    "# Normalize by the per-question standard deviation of relsim.\n",
    "# ave() broadcasts each question's sd back onto its rows, so the computation stays\n",
    "# numeric and vectorized. Iterating over rows with apply() would coerce the table to a\n",
    "# character matrix, sending relsim through formatted text (precision loss) and costing\n",
    "# one table lookup per row.\n",
    "cssag$simdevnorm <- cssag$relsim / ave(cssag$relsim, cssag$questionID, FUN = sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<dl class=dl-horizontal>\n",
       "\t<dt>low</dt>\n",
       "\t\t<dd>583</dd>\n",
       "\t<dt>mid</dt>\n",
       "\t\t<dd>703</dd>\n",
       "\t<dt>high</dt>\n",
       "\t\t<dd>482</dd>\n",
       "</dl>\n"
      ],
      "text/latex": [
       "\\begin{description*}\n",
       "\\item[low] 583\n",
       "\\item[mid] 703\n",
       "\\item[high] 482\n",
       "\\end{description*}\n"
      ],
      "text/markdown": [
       "low\n",
       ":   583mid\n",
       ":   703high\n",
       ":   482\n",
       "\n"
      ],
      "text/plain": [
       " low  mid high \n",
       " 583  703  482 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Three-way similarity category from simdevnorm: <= -0.5 is \"low\", >= 0.5 is \"high\",\n",
    "# everything in between stays \"mid\".\n",
    "cssag$simCat <- \"mid\"\n",
    "cssag$simCat[which(cssag$simdevnorm <= -0.5)] <- \"low\"\n",
    "cssag$simCat[which(cssag$simdevnorm >= 0.5)] <- \"high\"\n",
    "# Ordered level list so that \"low\" is the reference level.\n",
    "cssag$simCat <- factor(cssag$simCat, levels=c(\"low\",\"mid\",\"high\"))\n",
    "summary(cssag$simCat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic mixed model for CSSAG, fitted with blme::bglmer (binomial family).\n",
    "# The diffLevel x correct_fac interaction is kept in this model; multicollinearity is\n",
    "# checked with car::vif() in a later cell.\n",
    "\n",
    "cssagmodel <- bglmer(agree ~\n",
    "                     alnorm +                    \n",
    "                     simCat +\n",
    "                     diffLevel * correct_fac + \n",
    "                     scale(ans_homog) +\n",
    "                     (1|questionID) +\n",
    "                     (1|studID), # random intercept per student (students have several answers in CSSAG)\n",
    "                  data = cssag,\n",
    "                  family=\"binomial\", control = glmerControl(optimizer = \"bobyqa\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 2.8643\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: \n",
       "agree ~ alnorm + simCat + diffLevel * correct_fac + scale(ans_homog) +  \n",
       "    (1 | questionID) + (1 | studID)\n",
       "   Data: cssag\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  1617.8   1683.5   -796.9   1593.8     1756 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-5.4019  0.1824  0.2928  0.4936  1.8721 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.1191   0.345   \n",
       " questionID (Intercept) 1.2444   1.116   \n",
       "Number of obs: 1768, groups:  studID, 321; questionID, 31\n",
       "\n",
       "Fixed effects:\n",
       "                                 Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                       2.06256    0.32884   6.272 3.56e-10 ***\n",
       "alnorm                           -0.32890    0.09505  -3.460  0.00054 ***\n",
       "simCatmid                        -0.18755    0.16289  -1.151  0.24958    \n",
       "simCathigh                       -0.53416    0.19685  -2.714  0.00666 ** \n",
       "diffLevelunderstand              -0.76916    0.47867  -1.607  0.10808    \n",
       "diffLevelapply                   -0.98325    0.79802  -1.232  0.21791    \n",
       "correct_fac1                      0.30573    0.20704   1.477  0.13976    \n",
       "scale(ans_homog)                  0.27402    0.22428   1.222  0.22178    \n",
       "diffLevelunderstand:correct_fac1 -0.16293    0.31242  -0.521  0.60202    \n",
       "diffLevelapply:correct_fac1       0.20533    0.51137   0.402  0.68803    \n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg dffLvln dffLvlp crrc_1 scl(_)\n",
       "alnorm      -0.105                                                   \n",
       "simCatmid   -0.284  0.308                                            \n",
       "simCathigh  -0.265  0.544  0.570                                     \n",
       "dffLvlndrst -0.611 -0.006  0.006  0.015                              \n",
       "diffLvlpply -0.368 -0.005 -0.010 -0.010  0.212                       \n",
       "correct_fc1 -0.265 -0.254 -0.075 -0.109  0.194   0.128               \n",
       "scl(ns_hmg)  0.025 -0.027 -0.021 -0.016  0.131  -0.288  -0.025       \n",
       "dffLvlnd:_1  0.207  0.025 -0.013 -0.064 -0.246  -0.081  -0.627  0.006\n",
       "dffLvlpp:_1  0.097 -0.005  0.064  0.017 -0.087  -0.244  -0.377 -0.065\n",
       "            dffLvln:_1\n",
       "alnorm                \n",
       "simCatmid             \n",
       "simCathigh            \n",
       "dffLvlndrst           \n",
       "diffLvlpply           \n",
       "correct_fc1           \n",
       "scl(ns_hmg)           \n",
       "dffLvlnd:_1           \n",
       "dffLvlpp:_1  0.251    "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Coefficients, random-effect variances and fit statistics of the full CSSAG model\n",
    "summary(cssagmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 2.4082\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ (1 | questionID) + (1 | studID)\n",
       "   Data: cssag\n",
       "Control: glmerControl(optimizer = \"bobyqa\")\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       "  1619.8   1636.2   -806.9   1613.8     1765 \n",
       "\n",
       "Scaled residuals: \n",
       "    Min      1Q  Median      3Q     Max \n",
       "-4.8392  0.1983  0.3118  0.4961  1.8035 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.1281   0.3579  \n",
       " questionID (Intercept) 1.5676   1.2521  \n",
       "Number of obs: 1768, groups:  studID, 321; questionID, 31\n",
       "\n",
       "Fixed effects:\n",
       "            Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)   1.5797     0.2406   6.566 5.18e-11 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Baseline for CSSAG: only the intercept and random intercepts per question and per student.\n",
    "cssagmodel_empty <- bglmer(\n",
    "    agree ~ (1|questionID) + (1|studID),\n",
    "    data = cssag,\n",
    "    family = \"binomial\",\n",
    "    control = glmerControl(optimizer = \"bobyqa\")\n",
    ")\n",
    "summary(cssagmodel_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>cssagmodel_empty</th><td> 3        </td><td>1619.800  </td><td>1636.233  </td><td>-806.900  </td><td>1613.800  </td><td>      NA  </td><td>NA        </td><td>        NA</td></tr>\n",
       "\t<tr><th scope=row>cssagmodel</th><td>12        </td><td>1617.802  </td><td>1683.533  </td><td>-796.901  </td><td>1593.802  </td><td>19.99794  </td><td> 9        </td><td>0.01792513</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tcssagmodel\\_empty &  3         & 1619.800   & 1636.233   & -806.900   & 1613.800   &       NA   & NA         &         NA\\\\\n",
       "\tcssagmodel & 12         & 1617.802   & 1683.533   & -796.901   & 1593.802   & 19.99794   &  9         & 0.01792513\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|\n",
       "| cssagmodel_empty |  3         | 1619.800   | 1636.233   | -806.900   | 1613.800   |       NA   | NA         |         NA | \n",
       "| cssagmodel | 12         | 1617.802   | 1683.533   | -796.901   | 1593.802   | 19.99794   |  9         | 0.01792513 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                 Df AIC      BIC      logLik   deviance Chisq    Chi Df\n",
       "cssagmodel_empty  3 1619.800 1636.233 -806.900 1613.800       NA NA    \n",
       "cssagmodel       12 1617.802 1683.533 -796.901 1593.802 19.99794  9    \n",
       "                 Pr(>Chisq)\n",
       "cssagmodel_empty         NA\n",
       "cssagmodel       0.01792513"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Likelihood-ratio comparison of the random-only model against the full CSSAG model\n",
    "anova(cssagmodel_empty, cssagmodel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.541618</td><td>1       </td><td>1.241619</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.443064</td><td>2       </td><td>1.096027</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>1.312700</td><td>2       </td><td>1.070388</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>2.028152</td><td>1       </td><td>1.424132</td></tr>\n",
       "\t<tr><th scope=row>scale(ans_homog)</th><td>1.172865</td><td>1       </td><td>1.082989</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>2.112999</td><td>2       </td><td>1.205660</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.541618 & 1        & 1.241619\\\\\n",
       "\tsimCat & 1.443064 & 2        & 1.096027\\\\\n",
       "\tdiffLevel & 1.312700 & 2        & 1.070388\\\\\n",
       "\tcorrect\\_fac & 2.028152 & 1        & 1.424132\\\\\n",
       "\tscale(ans\\_homog) & 1.172865 & 1        & 1.082989\\\\\n",
       "\tdiffLevel:correct\\_fac & 2.112999 & 2        & 1.205660\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 1.541618 | 1        | 1.241619 | \n",
       "| simCat | 1.443064 | 2        | 1.096027 | \n",
       "| diffLevel | 1.312700 | 2        | 1.070388 | \n",
       "| correct_fac | 2.028152 | 1        | 1.424132 | \n",
       "| scale(ans_homog) | 1.172865 | 1        | 1.082989 | \n",
       "| diffLevel:correct_fac | 2.112999 | 2        | 1.205660 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                1.541618 1  1.241619       \n",
       "simCat                1.443064 2  1.096027       \n",
       "diffLevel             1.312700 2  1.070388       \n",
       "correct_fac           2.028152 1  1.424132       \n",
       "scale(ans_homog)      1.172865 1  1.082989       \n",
       "diffLevel:correct_fac 2.112999 2  1.205660       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "car::vif(cssagmodel)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "collinearity is OK"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating a combined dataset of all corpora"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   questionID           studID      language    correctness         anno1      \n",
       " asap_3 : 1891   cree_AU066:   47   en:23474   Min.   :0.0000   Min.   :0.000  \n",
       " asap_7 : 1799   cree_AU068:   47   de: 6754   1st Qu.:0.0000   1st Qu.:0.000  \n",
       " asap_8 : 1799   cree_AU061:   46              Median :0.5000   Median :0.500  \n",
       " asap_9 : 1798   creg_110  :   44              Mean   :0.5587   Mean   :0.558  \n",
       " asap_6 : 1797   creg_220  :   42              3rd Qu.:1.0000   3rd Qu.:1.000  \n",
       " asap_5 : 1795   creg_230  :   42              Max.   :1.0000   Max.   :1.000  \n",
       " (Other):19349   (Other)   :29960                                              \n",
       "     anno2         answerLength    questionLength         type      \n",
       " Min.   :0.0000   Min.   :   1.0   Min.   :  19.0   content :16253  \n",
       " 1st Qu.:0.0000   1st Qu.:  35.0   1st Qu.:  63.0   language:13975  \n",
       " Median :0.5000   Median : 107.0   Median : 105.0                   \n",
       " Mean   :0.5728   Mean   : 151.1   Mean   : 229.4                   \n",
       " 3rd Qu.:1.0000   3rd Qu.: 231.0   3rd Qu.: 165.0                   \n",
       " Max.   :1.0000   Max.   :1819.0   Max.   :1728.0                   \n",
       "                                                                    \n",
       "          diffLevel          Sim           ans_homog              collection   \n",
       " remember      :11561   Min.   :0.0000   Min.   :0.09764   classroom   : 6718  \n",
       " literal       : 4024   1st Qu.:0.2431   1st Qu.:0.28863   research    : 7581  \n",
       " reorganization: 4241   Median :0.3293   Median :0.33625   standardized:15929  \n",
       " inference     : 5710   Mean   :0.3647   Mean   :0.36474                       \n",
       " understand    : 4515   3rd Qu.:0.4494   3rd Qu.:0.42179                       \n",
       " several       :    0   Max.   :1.0000   Max.   :0.96770                       \n",
       " apply         :  177                                                          \n",
       "     agree            corpus         weights       correct_fac\n",
       " Min.   :0.0000   asap   :15929   Min.   : 1.000   0:11322    \n",
       " 1st Qu.:1.0000   asap_de:  602   1st Qu.: 1.000   1:18906    \n",
       " Median :1.0000   cree   :  566   Median : 1.000              \n",
       " Mean   :0.8931   creg   : 4384   Mean   : 3.094              \n",
       " 3rd Qu.:1.0000   cssag  : 1768   3rd Qu.: 2.000              \n",
       " Max.   :1.0000   pg     : 6979   Max.   :30.000              \n",
       "                                                              \n",
       "     alnorm             qlnorm             relsim            simdevnorm      \n",
       " Min.   :-6.33180   Min.   :-2.32228   Min.   :-0.844839   Min.   :-4.96807  \n",
       " 1st Qu.:-0.62920   1st Qu.:-1.01890   1st Qu.:-0.056622   1st Qu.:-0.65517  \n",
       " Median : 0.02694   Median : 0.30027   Median : 0.001756   Median : 0.02352  \n",
       " Mean   :-0.03224   Mean   :-0.02489   Mean   : 0.000000   Mean   : 0.00000  \n",
       " 3rd Qu.: 0.64980   3rd Qu.: 0.63025   3rd Qu.: 0.071669   3rd Qu.: 0.68920  \n",
       " Max.   : 5.26938   Max.   : 2.51564   Max.   : 0.615997   Max.   : 8.63439  \n",
       "                                                                             \n",
       "  simCat     \n",
       " low : 9096  \n",
       " mid :11400  \n",
       " high: 9732  \n",
       "             \n",
       "             \n",
       "             \n",
       "             "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "combined <- rbind(pg, cree, creg, asap_la, asap_ca, asap_de, cssag)\n",
    "combined$corpus <- as.factor(combined$corpus)\n",
    "combined$collection <- as.factor(combined$collection)\n",
    "combined <- combined[combined$diffLevel != \"several\",]\n",
    "summary(combined)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "combined$alnorm <- scale(log(combined$answerLength +1))\n",
    "combined$qlnorm <- scale(log(combined$questionLength +1))\n",
    "#hist(combined$alnorm)\n",
    "#hist(combined$qlnorm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "5710"
      ],
      "text/latex": [
       "5710"
      ],
      "text/markdown": [
       "5710"
      ],
      "text/plain": [
       "[1] 5710"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "177"
      ],
      "text/latex": [
       "177"
      ],
      "text/markdown": [
       "177"
      ],
      "text/plain": [
       "[1] 177"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# separate the data into language vs. content assessment\n",
    "# remove apply: only 177 instances and only from one corpus\n",
    "\n",
    "nrow(combined[combined$type == \"language\" & combined$diffLevel == \"inference\",])\n",
    "nrow(combined[combined$type != \"language\" & combined$diffLevel == \"apply\",])\n",
    "\n",
    "combined_lang <- combined[combined$type == \"language\"]\n",
    "combined_lang <- droplevels(combined_lang)\n",
    "\n",
    "combined_cont <- combined[combined$type != \"language\" & combined$diffLevel != \"apply\",]\n",
    "combined_cont <- droplevels(combined_cont)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "# re-scale alnorm and qlnorm for lang and cont!\n",
    "\n",
    "combined_lang$alnorm <- scale(log(combined_lang$answerLength +1))\n",
    "combined_lang$qlnorm <- scale(log(combined_lang$questionLength +1))\n",
    "combined_cont$alnorm <- scale(log(combined_cont$answerLength +1))\n",
    "combined_cont$qlnorm <- scale(log(combined_cont$questionLength +1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "         1    2   16\n",
       "                    \n",
       "asap  9025    0    0\n",
       "cree     0    0  566\n",
       "creg     0 4384    0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# LANG: look at weights and corpus frequencies and adapt to new (filtered) corpus frequencies\n",
    "# ftable(combined_lang$corpus, combined_lang$weights)\n",
    "fasap = table(combined_lang$corpus)[\"asap\"]\n",
    "fcreg = table(combined_lang$corpus)[\"creg\"]\n",
    "fcree = table(combined_lang$corpus)[\"cree\"]\n",
    "combined_lang[combined_lang$corpus==\"cree\",]$weights = round(fasap/fcree)\n",
    "combined_lang[combined_lang$corpus==\"creg\",]$weights = round(fasap/fcreg)\n",
    "ftable(combined_lang$corpus, combined_lang$weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "            1    4   11\n",
       "                       \n",
       "asap     6904    0    0\n",
       "asap_de     0    0  602\n",
       "cssag       0 1591    0\n",
       "pg       6979    0    0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# CONT: look at weights and corpus frequencies and adapt to new (filtered) corpus frequencies\n",
    "# ftable(combined_cont$corpus, combined_cont$weights)\n",
    "fasap = table(combined_cont$corpus)[\"asap\"]\n",
    "fasapde = table(combined_cont$corpus)[\"asap_de\"]\n",
    "fcssag = table(combined_cont$corpus)[\"cssag\"]\n",
    "fpg = table(combined_cont$corpus)[\"pg\"]\n",
    "combined_cont[combined_cont$corpus==\"asap_de\",]$weights = round(fasap/fasapde)\n",
    "combined_cont[combined_cont$corpus==\"cssag\",]$weights = round(fasap/fcssag)\n",
    "combined_cont[combined_cont$corpus==\"pg\",]$weights = round(fasap/fpg)\n",
    "ftable(combined_cont$corpus, combined_cont$weights)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Joint model of LANGUAGE questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     questionID          studID      language   correctness    \n",
       " asap_3   :1891   cree_AU066:   47   en:9591   Min.   :0.0000  \n",
       " asap_7   :1799   cree_AU068:   47   de:4384   1st Qu.:0.0000  \n",
       " asap_8   :1799   cree_AU061:   46             Median :0.5000  \n",
       " asap_9   :1798   creg_110  :   44             Mean   :0.5527  \n",
       " asap_4   :1738   creg_220  :   42             3rd Qu.:1.0000  \n",
       " creg_2068:  97   creg_230  :   42             Max.   :1.0000  \n",
       " (Other)  :4853   (Other)   :13707                             \n",
       "     anno1            anno2         answerLength    questionLength \n",
       " Min.   :0.0000   Min.   :0.0000   Min.   :   3.0   Min.   : 19.0  \n",
       " 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  74.0   1st Qu.: 68.0  \n",
       " Median :0.5000   Median :0.5000   Median : 165.0   Median :132.0  \n",
       " Mean   :0.5527   Mean   :0.5771   Mean   : 196.5   Mean   :116.2  \n",
       " 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 283.0   3rd Qu.:165.0  \n",
       " Max.   :1.0000   Max.   :1.0000   Max.   :1819.0   Max.   :186.0  \n",
       "                                                                   \n",
       "       type                diffLevel         Sim           ans_homog      \n",
       " language:13975   literal       :4024   Min.   :0.0000   Min.   :0.09764  \n",
       "                  reorganization:4241   1st Qu.:0.2721   1st Qu.:0.30044  \n",
       "                  inference     :5710   Median :0.3356   Median :0.33450  \n",
       "                                        Mean   :0.3673   Mean   :0.36725  \n",
       "                                        3rd Qu.:0.4313   3rd Qu.:0.44244  \n",
       "                                        Max.   :1.0000   Max.   :0.96770  \n",
       "                                                                          \n",
       "        collection       agree         corpus        weights       correct_fac\n",
       " classroom   :4950   Min.   :0.0000   asap:9025   Min.   : 1.000   0:4452     \n",
       " standardized:9025   1st Qu.:1.0000   cree: 566   1st Qu.: 1.000   1:9523     \n",
       "                     Median :1.0000   creg:4384   Median : 1.000              \n",
       "                     Mean   :0.8415               Mean   : 1.921              \n",
       "                     3rd Qu.:1.0000               3rd Qu.: 2.000              \n",
       "                     Max.   :1.0000               Max.   :16.000              \n",
       "                                                                              \n",
       "     alnorm            qlnorm            relsim             simdevnorm       \n",
       " Min.   :-4.1142   Min.   :-2.9679   Min.   :-0.7053973   Min.   :-4.968074  \n",
       " 1st Qu.:-0.7393   1st Qu.:-0.7268   1st Qu.:-0.0484910   1st Qu.:-0.657816  \n",
       " Median : 0.1754   Median : 0.4609   Median :-0.0001856   Median :-0.002787  \n",
       " Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000000   Mean   : 0.000000  \n",
       " 3rd Qu.: 0.7937   3rd Qu.: 0.8620   3rd Qu.: 0.0542618   3rd Qu.: 0.646948  \n",
       " Max.   : 2.9325   Max.   : 1.0775   Max.   : 0.5588235   Max.   : 8.634394  \n",
       "                                                                             \n",
       "  simCat    \n",
       " low :4277  \n",
       " mid :5392  \n",
       " high:4306  \n",
       "            \n",
       "            \n",
       "            \n",
       "            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(combined_lang)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_lang <- bglmer(agree  ~\n",
    "                     alnorm +                    \n",
    "                      simCat +\n",
    "                     ans_homog +\n",
    "                      diffLevel * correct_fac + \n",
    "                     (1|corpus) +\n",
    "#                     (1|collection) +                              \n",
    "                     (1|questionID) +\n",
    "                     (1|studID),\n",
    "                  data = combined_lang,\n",
    "                  weights = weights,\n",
    "                  family=\"binomial\",\n",
    "                  control = glmerControl(optimizer = c(\"Nelder_Mead\",\"bobyqa\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "isSingular(combinedmodel_lang)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.473941</td><td>1       </td><td>1.214060</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.383743</td><td>2       </td><td>1.084586</td></tr>\n",
       "\t<tr><th scope=row>ans_homog</th><td>1.044820</td><td>1       </td><td>1.022164</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>1.074765</td><td>2       </td><td>1.018189</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>3.360965</td><td>1       </td><td>1.833293</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>3.242972</td><td>2       </td><td>1.341948</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.473941 & 1        & 1.214060\\\\\n",
       "\tsimCat & 1.383743 & 2        & 1.084586\\\\\n",
       "\tans\\_homog & 1.044820 & 1        & 1.022164\\\\\n",
       "\tdiffLevel & 1.074765 & 2        & 1.018189\\\\\n",
       "\tcorrect\\_fac & 3.360965 & 1        & 1.833293\\\\\n",
       "\tdiffLevel:correct\\_fac & 3.242972 & 2        & 1.341948\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 1.473941 | 1        | 1.214060 | \n",
       "| simCat | 1.383743 | 2        | 1.084586 | \n",
       "| ans_homog | 1.044820 | 1        | 1.022164 | \n",
       "| diffLevel | 1.074765 | 2        | 1.018189 | \n",
       "| correct_fac | 3.360965 | 1        | 1.833293 | \n",
       "| diffLevel:correct_fac | 3.242972 | 2        | 1.341948 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                1.473941 1  1.214060       \n",
       "simCat                1.383743 2  1.084586       \n",
       "ans_homog             1.044820 1  1.022164       \n",
       "diffLevel             1.074765 2  1.018189       \n",
       "correct_fac           3.360965 1  1.833293       \n",
       "diffLevel:correct_fac 3.242972 2  1.341948       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "car::vif(combinedmodel_lang)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Collinearity: GVIF < 2, all OK."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : corpus ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : 0.259\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ alnorm + simCat + ans_homog + diffLevel * correct_fac +  \n",
       "    (1 | corpus) + (1 | questionID) + (1 | studID)\n",
       "   Data: combined_lang\n",
       "Weights: weights\n",
       "Control: glmerControl(optimizer = c(\"Nelder_Mead\", \"bobyqa\"))\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       " 17319.6  17417.6  -8646.8  17293.6    13962 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-29.2186   0.1616   0.3561   0.4767  30.9673 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 0.5487   0.7408  \n",
       " questionID (Intercept) 4.8270   2.1970  \n",
       " corpus     (Intercept) 0.3177   0.5636  \n",
       "Number of obs: 13975, groups:  studID, 9435; questionID, 229; corpus, 3\n",
       "\n",
       "Fixed effects:\n",
       "                                     Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                           0.58856    0.62973   0.935 0.349982    \n",
       "alnorm                               -0.37664    0.04369  -8.621  < 2e-16 ***\n",
       "simCatmid                            -0.19330    0.05297  -3.649 0.000263 ***\n",
       "simCathigh                           -0.21886    0.06257  -3.498 0.000469 ***\n",
       "ans_homog                             1.39991    0.99383   1.409 0.158951    \n",
       "diffLevelreorganization               1.57394    0.49771   3.162 0.001565 ** \n",
       "diffLevelinference                    2.24076    0.53978   4.151 3.31e-05 ***\n",
       "correct_fac1                          3.13551    0.09029  34.726  < 2e-16 ***\n",
       "diffLevelreorganization:correct_fac1 -2.48858    0.12059 -20.637  < 2e-16 ***\n",
       "diffLevelinference:correct_fac1      -3.24549    0.12645 -25.667  < 2e-16 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg ans_hm dffLvlr dffLvln crrc_1\n",
       "alnorm       0.025                                                   \n",
       "simCatmid   -0.028  0.273                                            \n",
       "simCathigh  -0.016  0.500  0.567                                     \n",
       "ans_homog   -0.708  0.025  0.016  0.020                              \n",
       "dffLvlrrgnz -0.257 -0.010 -0.017 -0.022  0.154                       \n",
       "dffLvlnfrnc -0.258  0.004 -0.020 -0.019  0.129  0.169                \n",
       "correct_fc1 -0.004 -0.268 -0.211 -0.263 -0.073  0.072   0.069        \n",
       "dffLvlrr:_1  0.001  0.110  0.140  0.186  0.051 -0.121  -0.052  -0.722\n",
       "dffLvlnf:_1 -0.002  0.020  0.126  0.134  0.047 -0.052  -0.134  -0.688\n",
       "            dffLvlr:_1\n",
       "alnorm                \n",
       "simCatmid             \n",
       "simCathigh            \n",
       "ans_homog             \n",
       "dffLvlrrgnz           \n",
       "dffLvlnfrnc           \n",
       "correct_fc1           \n",
       "dffLvlrr:_1           \n",
       "dffLvlnf:_1  0.511    "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(combinedmodel_lang)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now inspect random effects for corpora"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>(Intercept)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>asap</th><td>-0.2609264</td></tr>\n",
       "\t<tr><th scope=row>cree</th><td> 0.1967596</td></tr>\n",
       "\t<tr><th scope=row>creg</th><td>-0.4937453</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|l}\n",
       "  & (Intercept)\\\\\n",
       "\\hline\n",
       "\tasap & -0.2609264\\\\\n",
       "\tcree &  0.1967596\\\\\n",
       "\tcreg & -0.4937453\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | (Intercept) | \n",
       "|---|---|---|\n",
       "| asap | -0.2609264 | \n",
       "| cree |  0.1967596 | \n",
       "| creg | -0.4937453 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "     (Intercept)\n",
       "asap -0.2609264 \n",
       "cree  0.1967596 \n",
       "creg -0.4937453 "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ranef(combinedmodel_lang)$corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, test significance of random effects by ratio test (removing first corpus, then question, then student)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_lang1 <- bglmer(agree  ~\n",
    "                     alnorm +                    \n",
    "                      simCat +\n",
    "                     ans_homog +\n",
    "                      diffLevel * correct_fac + \n",
    "#                     (1|corpus) +\n",
    "#                     (1|collection) +                              \n",
    "                     (1|questionID) +\n",
    "                     (1|studID),\n",
    "                  data = combined_lang,\n",
    "                  weights = weights,\n",
    "                  family=\"binomial\",\n",
    "                  control = glmerControl(optimizer = c(\"Nelder_Mead\",\"bobyqa\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_lang2 <- bglmer(agree  ~\n",
    "                     alnorm +                    \n",
    "                      simCat +\n",
    "                     ans_homog +\n",
    "                      diffLevel * correct_fac + \n",
    "#                     (1|corpus) +\n",
    "#                     (1|collection) +                              \n",
    "#                     (1|questionID) +\n",
    "                     (1|studID),\n",
    "                  data = combined_lang,\n",
    "                  weights = weights,\n",
    "                  family=\"binomial\",\n",
    "                  control = glmerControl(optimizer = c(\"Nelder_Mead\",\"bobyqa\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_lang3 <- glm(agree  ~\n",
    "                     alnorm +                    \n",
    "                      simCat +\n",
    "                     ans_homog +\n",
    "                      diffLevel * correct_fac, \n",
    "#                     (1|corpus) +\n",
    "#                     (1|collection) +                              \n",
    "#                     (1|questionID) +\n",
    "#                     (1|studID),\n",
    "                  data = combined_lang,\n",
    "                  weights = weights,\n",
    "                  family=\"binomial\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>combinedmodel_lang3</th><td>10          </td><td>20517.50    </td><td>20592.95    </td><td>-10248.750  </td><td>20497.50    </td><td>       NA   </td><td>NA          </td><td>          NA</td></tr>\n",
       "\t<tr><th scope=row>combinedmodel_lang2</th><td>11          </td><td>20101.93    </td><td>20184.93    </td><td>-10039.966  </td><td>20079.93    </td><td> 417.5665   </td><td> 1          </td><td>8.262897e-93</td></tr>\n",
       "\t<tr><th scope=row>combinedmodel_lang1</th><td>12          </td><td>17313.17    </td><td>17403.71    </td><td> -8644.586  </td><td>17289.17    </td><td>2790.7601   </td><td> 1          </td><td>0.000000e+00</td></tr>\n",
       "\t<tr><th scope=row>combinedmodel_lang</th><td>13          </td><td>17319.55    </td><td>17417.64    </td><td> -8646.776  </td><td>17293.55    </td><td>   0.0000   </td><td> 1          </td><td>1.000000e+00</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tcombinedmodel\\_lang3 & 10           & 20517.50     & 20592.95     & -10248.750   & 20497.50     &        NA    & NA           &           NA\\\\\n",
       "\tcombinedmodel\\_lang2 & 11           & 20101.93     & 20184.93     & -10039.966   & 20079.93     &  417.5665    &  1           & 8.262897e-93\\\\\n",
       "\tcombinedmodel\\_lang1 & 12           & 17313.17     & 17403.71     &  -8644.586   & 17289.17     & 2790.7601    &  1           & 0.000000e+00\\\\\n",
       "\tcombinedmodel\\_lang & 13           & 17319.55     & 17417.64     &  -8646.776   & 17293.55     &    0.0000    &  1           & 1.000000e+00\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|---|---|\n",
       "| combinedmodel_lang3 | 10           | 20517.50     | 20592.95     | -10248.750   | 20497.50     |        NA    | NA           |           NA | \n",
       "| combinedmodel_lang2 | 11           | 20101.93     | 20184.93     | -10039.966   | 20079.93     |  417.5665    |  1           | 8.262897e-93 | \n",
       "| combinedmodel_lang1 | 12           | 17313.17     | 17403.71     |  -8644.586   | 17289.17     | 2790.7601    |  1           | 0.000000e+00 | \n",
       "| combinedmodel_lang | 13           | 17319.55     | 17417.64     |  -8646.776   | 17293.55     |    0.0000    |  1           | 1.000000e+00 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                    Df AIC      BIC      logLik     deviance Chisq     Chi Df\n",
       "combinedmodel_lang3 10 20517.50 20592.95 -10248.750 20497.50        NA NA    \n",
       "combinedmodel_lang2 11 20101.93 20184.93 -10039.966 20079.93  417.5665  1    \n",
       "combinedmodel_lang1 12 17313.17 17403.71  -8644.586 17289.17 2790.7601  1    \n",
       "combinedmodel_lang  13 17319.55 17417.64  -8646.776 17293.55    0.0000  1    \n",
       "                    Pr(>Chisq)  \n",
       "combinedmodel_lang3           NA\n",
       "combinedmodel_lang2 8.262897e-93\n",
       "combinedmodel_lang1 0.000000e+00\n",
       "combinedmodel_lang  1.000000e+00"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(combinedmodel_lang, combinedmodel_lang1, combinedmodel_lang2, combinedmodel_lang3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Corpus not significant for LA, all others are"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Joint model of CONTENT questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   questionID        studID      language    correctness         anno1       \n",
       " asap_6 :1797   cssag_1 :   22   en:13883   Min.   :0.0000   Min.   :0.0000  \n",
       " asap_5 :1795   cssag_14:   22   de: 2193   1st Qu.:0.0000   1st Qu.:0.0000  \n",
       " asap_1 :1672   cssag_18:   22              Median :0.6667   Median :0.6667  \n",
       " asap_10:1640   cssag_19:   22              Mean   :0.5639   Mean   :0.5626  \n",
       " pg_1   : 698   cssag_34:   22              3rd Qu.:1.0000   3rd Qu.:1.0000  \n",
       " pg_13  : 698   cssag_38:   22              Max.   :1.0000   Max.   :1.0000  \n",
       " (Other):7776   (Other) :15944                                               \n",
       "     anno2         answerLength    questionLength        type      \n",
       " Min.   :0.0000   Min.   :   1.0   Min.   :  32.0   content:16076  \n",
       " 1st Qu.:0.0000   1st Qu.:  19.0   1st Qu.:  63.0                  \n",
       " Median :0.6667   Median :  56.0   Median : 105.0                  \n",
       " Mean   :0.5687   Mean   : 111.5   Mean   : 327.9                  \n",
       " 3rd Qu.:1.0000   3rd Qu.: 168.0   3rd Qu.: 225.0                  \n",
       " Max.   :1.0000   Max.   :1477.0   Max.   :1728.0                  \n",
       "                                                                   \n",
       "      diffLevel          Sim           ans_homog             collection  \n",
       " remember  :11561   Min.   :0.0000   Min.   :0.1773   classroom   :1591  \n",
       " understand: 4515   1st Qu.:0.2056   1st Qu.:0.2203   research    :7581  \n",
       "                    Median :0.3202   Median :0.3505   standardized:6904  \n",
       "                    Mean   :0.3622   Mean   :0.3622                      \n",
       "                    3rd Qu.:0.4776   3rd Qu.:0.4027                      \n",
       "                    Max.   :0.9429   Max.   :0.8448                      \n",
       "                                                                         \n",
       "     agree            corpus        weights       correct_fac\n",
       " Min.   :0.0000   asap   :6904   Min.   : 1.000   0:6780     \n",
       " 1st Qu.:1.0000   asap_de: 602   1st Qu.: 1.000   1:9296     \n",
       " Median :1.0000   cssag  :1591   Median : 1.000              \n",
       " Mean   :0.9392   pg     :6979   Mean   : 1.671              \n",
       " 3rd Qu.:1.0000                  3rd Qu.: 1.000              \n",
       " Max.   :1.0000                  Max.   :11.000              \n",
       "                                                             \n",
       "     alnorm             qlnorm            relsim            simdevnorm      \n",
       " Min.   :-2.82270   Min.   :-1.2366   Min.   :-0.844839   Min.   :-4.62647  \n",
       " 1st Qu.:-0.90117   1st Qu.:-0.6795   1st Qu.:-0.067211   1st Qu.:-0.64997  \n",
       " Median :-0.02717   Median :-0.2551   Median : 0.004266   Median : 0.04828  \n",
       " Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.00000  \n",
       " 3rd Qu.: 0.87981   3rd Qu.: 0.3817   3rd Qu.: 0.092574   3rd Qu.: 0.70971  \n",
       " Max.   : 2.68949   Max.   : 2.0932   Max.   : 0.615997   Max.   : 6.50311  \n",
       "                                                                            \n",
       "  simCat    \n",
       " low :4759  \n",
       " mid :5942  \n",
       " high:5375  \n",
       "            \n",
       "            \n",
       "            \n",
       "            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(combined_cont)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_cont <- bglmer(agree  ~\n",
    "                     alnorm +                    \n",
    "                     simCat +\n",
    "                     ans_homog +\n",
    "                     diffLevel * correct_fac + \n",
    "                     (1|corpus) +\n",
    "                     (1|questionID) +\n",
    "#                     (1|collection) + \n",
    "                     (1|studID),\n",
    "                  data = combined_cont,\n",
    "                  weights = weights,\n",
    "                  family= \"binomial\",\n",
    "                  control = glmerControl(optimizer = c(\"Nelder_Mead\",\"bobyqa\"))) # , optCtrl=list(maxfun=10000)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "FALSE"
      ],
      "text/latex": [
       "FALSE"
      ],
      "text/markdown": [
       "FALSE"
      ],
      "text/plain": [
       "[1] FALSE"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "isSingular(combinedmodel_cont)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>GVIF</th><th scope=col>Df</th><th scope=col>GVIF^(1/(2*Df))</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>alnorm</th><td>1.502280</td><td>1       </td><td>1.225675</td></tr>\n",
       "\t<tr><th scope=row>simCat</th><td>1.465239</td><td>2       </td><td>1.100214</td></tr>\n",
       "\t<tr><th scope=row>ans_homog</th><td>1.000927</td><td>1       </td><td>1.000464</td></tr>\n",
       "\t<tr><th scope=row>diffLevel</th><td>1.006840</td><td>1       </td><td>1.003414</td></tr>\n",
       "\t<tr><th scope=row>correct_fac</th><td>1.688996</td><td>1       </td><td>1.299614</td></tr>\n",
       "\t<tr><th scope=row>diffLevel:correct_fac</th><td>1.555289</td><td>1       </td><td>1.247112</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|lll}\n",
       "  & GVIF & Df & GVIF\\textasciicircum{}(1/(2*Df))\\\\\n",
       "\\hline\n",
       "\talnorm & 1.502280 & 1        & 1.225675\\\\\n",
       "\tsimCat & 1.465239 & 2        & 1.100214\\\\\n",
       "\tans\\_homog & 1.000927 & 1        & 1.000464\\\\\n",
       "\tdiffLevel & 1.006840 & 1        & 1.003414\\\\\n",
       "\tcorrect\\_fac & 1.688996 & 1        & 1.299614\\\\\n",
       "\tdiffLevel:correct\\_fac & 1.555289 & 1        & 1.247112\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | GVIF | Df | GVIF^(1/(2*Df)) | \n",
       "|---|---|---|---|---|---|\n",
       "| alnorm | 1.502280 | 1        | 1.225675 | \n",
       "| simCat | 1.465239 | 2        | 1.100214 | \n",
       "| ans_homog | 1.000927 | 1        | 1.000464 | \n",
       "| diffLevel | 1.006840 | 1        | 1.003414 | \n",
       "| correct_fac | 1.688996 | 1        | 1.299614 | \n",
       "| diffLevel:correct_fac | 1.555289 | 1        | 1.247112 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                      GVIF     Df GVIF^(1/(2*Df))\n",
       "alnorm                1.502280 1  1.225675       \n",
       "simCat                1.465239 2  1.100214       \n",
       "ans_homog             1.000927 1  1.000464       \n",
       "diffLevel             1.006840 1  1.003414       \n",
       "correct_fac           1.688996 1  1.299614       \n",
       "diffLevel:correct_fac 1.555289 1  1.247112       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "car::vif(combinedmodel_cont)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Multicollinearity is fine."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Cov prior  : studID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : questionID ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "           : corpus ~ wishart(df = 3.5, scale = Inf, posterior.scale = cov, common.scale = TRUE)\n",
       "Prior dev  : -8.2387\n",
       "\n",
       "Generalized linear mixed model fit by maximum likelihood (Laplace\n",
       "  Approximation) [bglmerMod]\n",
       " Family: binomial  ( logit )\n",
       "Formula: agree ~ alnorm + simCat + ans_homog + diffLevel * correct_fac +  \n",
       "    (1 | corpus) + (1 | questionID) + (1 | studID)\n",
       "   Data: combined_cont\n",
       "Weights: weights\n",
       "Control: glmerControl(optimizer = c(\"Nelder_Mead\", \"bobyqa\"))\n",
       "\n",
       "     AIC      BIC   logLik deviance df.resid \n",
       " 10033.3  10117.8  -5005.6  10011.3    16065 \n",
       "\n",
       "Scaled residuals: \n",
       "     Min       1Q   Median       3Q      Max \n",
       "-21.6521   0.0292   0.0569   0.1779   8.5668 \n",
       "\n",
       "Random effects:\n",
       " Groups     Name        Variance Std.Dev.\n",
       " studID     (Intercept) 12.730   3.568   \n",
       " questionID (Intercept)  3.819   1.954   \n",
       " corpus     (Intercept)  4.995   2.235   \n",
       "Number of obs: 16076, groups:  studID, 8481; questionID, 44; corpus, 4\n",
       "\n",
       "Fixed effects:\n",
       "                                 Estimate Std. Error z value Pr(>|z|)    \n",
       "(Intercept)                       5.70014    1.68093   3.391 0.000696 ***\n",
       "alnorm                           -0.43695    0.07854  -5.564 2.64e-08 ***\n",
       "simCatmid                         0.29115    0.08946   3.255 0.001136 ** \n",
       "simCathigh                        0.30333    0.10855   2.794 0.005201 ** \n",
       "ans_homog                         2.06692    3.36690   0.614 0.539287    \n",
       "diffLevelunderstand              -1.17066    0.72191  -1.622 0.104886    \n",
       "correct_fac1                      1.13266    0.10767  10.520  < 2e-16 ***\n",
       "diffLevelunderstand:correct_fac1 -0.85224    0.17258  -4.938 7.88e-07 ***\n",
       "---\n",
       "Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
       "\n",
       "Correlation of Fixed Effects:\n",
       "            (Intr) alnorm smCtmd smCthg ans_hm dffLvl crrc_1\n",
       "alnorm      -0.024                                          \n",
       "simCatmid   -0.018  0.307                                   \n",
       "simCathigh  -0.024  0.548  0.530                            \n",
       "ans_homog   -0.642 -0.001 -0.009  0.003                     \n",
       "dffLvlndrst -0.227  0.000 -0.004 -0.001  0.022              \n",
       "correct_fc1 -0.006 -0.281 -0.149 -0.223 -0.012  0.047       \n",
       "dffLvlnd:_1 -0.005  0.005  0.022 -0.036 -0.001 -0.079 -0.564"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "summary(combinedmodel_cont)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now inspect random effects for corpora"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>(Intercept)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>asap</th><td> 0.233283</td></tr>\n",
       "\t<tr><th scope=row>asap_de</th><td>-1.161782</td></tr>\n",
       "\t<tr><th scope=row>cssag</th><td>-3.868017</td></tr>\n",
       "\t<tr><th scope=row>pg</th><td>-3.360495</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|l}\n",
       "  & (Intercept)\\\\\n",
       "\\hline\n",
       "\tasap &  0.233283\\\\\n",
       "\tasap\\_de & -1.161782\\\\\n",
       "\tcssag & -3.868017\\\\\n",
       "\tpg & -3.360495\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | (Intercept) | \n",
       "|---|---|---|---|\n",
       "| asap |  0.233283 | \n",
       "| asap_de | -1.161782 | \n",
       "| cssag | -3.868017 | \n",
       "| pg | -3.360495 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "        (Intercept)\n",
       "asap     0.233283  \n",
       "asap_de -1.161782  \n",
       "cssag   -3.868017  \n",
       "pg      -3.360495  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ranef(combinedmodel_cont)$corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, test significance of random effects by ratio test (removing first corpus, then question, then student)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_cont1 <- bglmer(agree  ~\n",
    "                     alnorm +                    \n",
    "                     simCat +\n",
    "                     ans_homog +\n",
    "                     diffLevel * correct_fac + \n",
    "#                     (1|corpus) +\n",
    "                     (1|questionID) +\n",
    "#                     (1|collection) + \n",
    "                     (1|studID),\n",
    "                  data = combined_cont,\n",
    "                  weights = weights,\n",
    "                  family= \"binomial\",\n",
    "                  control = glmerControl(optimizer = c(\"Nelder_Mead\",\"bobyqa\"))) # , optCtrl=list(maxfun=10000)))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_cont2 <- bglmer(agree  ~\n",
    "                     alnorm +                    \n",
    "                     simCat +\n",
    "                     ans_homog +\n",
    "                     diffLevel * correct_fac + \n",
    "#                     (1|corpus) +\n",
    "#                     (1|questionID) +\n",
    "#                     (1|collection) + \n",
    "                     (1|studID),\n",
    "                  data = combined_cont,\n",
    "                  weights = weights,\n",
    "                  family= \"binomial\",\n",
    "                  control = glmerControl(optimizer = c(\"Nelder_Mead\",\"bobyqa\"))) # , optCtrl=list(maxfun=10000)))\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "combinedmodel_cont3 <- glm(agree  ~\n",
    "                     alnorm +                    \n",
    "                     simCat +\n",
    "                     ans_homog +\n",
    "                     diffLevel * correct_fac ,\n",
    "#                     (1|corpus) +\n",
    "#                     (1|questionID) +\n",
    "#                     (1|collection) + \n",
    "#                     (1|studID),\n",
    "                  data = combined_cont,\n",
    "                  weights = weights,\n",
    "                  family= \"binomial\") # , optCtrl=list(maxfun=10000)))\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th></th><th scope=col>Df</th><th scope=col>AIC</th><th scope=col>BIC</th><th scope=col>logLik</th><th scope=col>deviance</th><th scope=col>Chisq</th><th scope=col>Chi Df</th><th scope=col>Pr(&gt;Chisq)</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><th scope=row>combinedmodel_cont3</th><td> 8           </td><td>18831.60     </td><td>18893.08     </td><td>-9407.798    </td><td>18815.60     </td><td>        NA   </td><td>NA           </td><td>           NA</td></tr>\n",
       "\t<tr><th scope=row>combinedmodel_cont2</th><td> 9           </td><td>10537.66     </td><td>10606.83     </td><td>-5259.831    </td><td>10519.66     </td><td>8295.93484   </td><td> 1           </td><td> 0.000000e+00</td></tr>\n",
       "\t<tr><th scope=row>combinedmodel_cont1</th><td>10           </td><td>10043.82     </td><td>10120.67     </td><td>-5011.911    </td><td>10023.82     </td><td> 495.83989   </td><td> 1           </td><td>7.640696e-110</td></tr>\n",
       "\t<tr><th scope=row>combinedmodel_cont</th><td>11           </td><td>10033.26     </td><td>10117.79     </td><td>-5005.629    </td><td>10011.26     </td><td>  12.56332   </td><td> 1           </td><td> 3.933931e-04</td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       "  & Df & AIC & BIC & logLik & deviance & Chisq & Chi Df & Pr(>Chisq)\\\\\n",
       "\\hline\n",
       "\tcombinedmodel\\_cont3 &  8            & 18831.60      & 18893.08      & -9407.798     & 18815.60      &         NA    & NA            &            NA\\\\\n",
       "\tcombinedmodel\\_cont2 &  9            & 10537.66      & 10606.83      & -5259.831     & 10519.66      & 8295.93484    &  1            &  0.000000e+00\\\\\n",
       "\tcombinedmodel\\_cont1 & 10            & 10043.82      & 10120.67      & -5011.911     & 10023.82      &  495.83989    &  1            & 7.640696e-110\\\\\n",
       "\tcombinedmodel\\_cont & 11            & 10033.26      & 10117.79      & -5005.629     & 10011.26      &   12.56332    &  1            &  3.933931e-04\\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "| <!--/--> | Df | AIC | BIC | logLik | deviance | Chisq | Chi Df | Pr(>Chisq) | \n",
       "|---|---|---|---|\n",
       "| combinedmodel_cont3 |  8            | 18831.60      | 18893.08      | -9407.798     | 18815.60      |         NA    | NA            |            NA | \n",
       "| combinedmodel_cont2 |  9            | 10537.66      | 10606.83      | -5259.831     | 10519.66      | 8295.93484    |  1            |  0.000000e+00 | \n",
       "| combinedmodel_cont1 | 10            | 10043.82      | 10120.67      | -5011.911     | 10023.82      |  495.83989    |  1            | 7.640696e-110 | \n",
       "| combinedmodel_cont | 11            | 10033.26      | 10117.79      | -5005.629     | 10011.26      |   12.56332    |  1            |  3.933931e-04 | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "                    Df AIC      BIC      logLik    deviance Chisq      Chi Df\n",
       "combinedmodel_cont3  8 18831.60 18893.08 -9407.798 18815.60         NA NA    \n",
       "combinedmodel_cont2  9 10537.66 10606.83 -5259.831 10519.66 8295.93484  1    \n",
       "combinedmodel_cont1 10 10043.82 10120.67 -5011.911 10023.82  495.83989  1    \n",
       "combinedmodel_cont  11 10033.26 10117.79 -5005.629 10011.26   12.56332  1    \n",
       "                    Pr(>Chisq)   \n",
       "combinedmodel_cont3            NA\n",
       "combinedmodel_cont2  0.000000e+00\n",
       "combinedmodel_cont1 7.640696e-110\n",
       "combinedmodel_cont   3.933931e-04"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "anova(combinedmodel_cont, combinedmodel_cont1, combinedmodel_cont2, combinedmodel_cont3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All highly significant."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}