Updated location prediction to openstreetmap #7

Merged
merged 13 commits into from Mar 8, 2021
13 changes: 13 additions & 0 deletions analysis_scripts/analysis_utils.R
@@ -2,6 +2,12 @@ library(jsonlite)
library(data.table)
library(dplyr)
library(here)  # here() is called below; adding the import unless it is loaded elsewhere in this file


proj_dir = here()
source(paste(proj_dir, "/utils/scraper_processing_utils.R", sep=""))



read_corenlp_quote_files <- function(corenlp_file){

corenlp_df = data.frame(fread(corenlp_file, header=T, quote=""))
@@ -32,10 +38,16 @@ read_benchmark_quote_file <- function(gold_file){
read_corenlp_location_files <- function(corenlp_file){

corenlp_df = data.frame(fread(corenlp_file, header=T))
country_df = get_country_info()
corenlp_df = merge(corenlp_df, country_df, all.x=T)
corenlp_df = unique(corenlp_df)

colnames(corenlp_df)[which(colnames(corenlp_df)=="address.country_code")] = "est_country_code"
colnames(corenlp_df)[which(colnames(corenlp_df)=="country")] = "est_country"
colnames(corenlp_df)[which(colnames(corenlp_df)=="un_region")] = "est_un_region"
colnames(corenlp_df)[which(colnames(corenlp_df)=="un_subregion")] = "est_un_subregion"

corenlp_df$est_country[which(is.na(corenlp_df$est_country_code))] = "NO_EST"
corenlp_df$est_country[which(is.na(corenlp_df$est_country))] = "NO_EST"
corenlp_df$est_un_region[which(is.na(corenlp_df$est_un_region))] = "NO_EST"
corenlp_df$est_un_subregion[which(is.na(corenlp_df$est_un_subregion))] = "NO_EST"
@@ -47,6 +59,7 @@ read_corenlp_location_files <- function(corenlp_file){
read_benchmark_location_file <- function(bm_loc_file){

gold_df = data.frame(fread(bm_loc_file, header=T))
colnames(gold_df)[which(colnames(gold_df)=="address.country_code")] = "true_country_code"
colnames(gold_df)[which(colnames(gold_df)=="country")] = "true_country"
colnames(gold_df)[which(colnames(gold_df)=="un_region")] = "true_un_region"
colnames(gold_df)[which(colnames(gold_df)=="un_subregion")] = "true_un_subregion"
183 changes: 176 additions & 7 deletions analysis_scripts/analyze_benchmark_data/benchmark_analysis.Rmd
@@ -12,6 +12,7 @@ require(data.table)
require(here)
require(ggplot2)
require(caret)
require(ggrepel)

proj_dir = here()
source(paste(proj_dir, "/analysis_scripts/analysis_utils.R", sep=""))
@@ -167,7 +168,7 @@ bm_loc_file = paste(proj_dir,
bm_loc_df = read_benchmark_location_file(bm_loc_file)

raw_loc_file = paste(proj_dir,
"/data/benchmark_data/benchmark_location_table_hand_annotated.tsv",
"/data/benchmark_data/benchmark_location_table_raw.tsv",
sep="")

raw_loc_df = read_corenlp_location_files(raw_loc_file)
@@ -191,33 +192,36 @@ Now lets first look at the benchmark data

# filter out any places where the gender is NA
# this can happen when a quote is from an unidentified i.e. spokesperson
eval_df = subset(bm_loc_df, !is.na(true_country))
eval_df = subset(bm_loc_df, true_country_code != "NAN")

eval_df = merge(year_df, eval_df)

# we only care if a country was mentioned once or not at all
eval_df = eval_df[,c("file_id", "true_country", "true_un_region",
eval_df = eval_df[,c("file_id", "true_country_code", "true_un_region",
"true_un_subregion", "year")]

eval_df = unique(eval_df)

## plot per year stats
country_agg = data.frame(table(eval_df[,c("true_country", "year")]))
ggplot(country_agg, aes(x=year, y=Freq, color=true_country, group=true_country)) +
country_agg = unique(eval_df[,c("file_id","true_country_code", "year")])
country_agg = data.frame(table(country_agg[,c("true_country_code", "year")]))
ggplot(country_agg, aes(x=year, y=Freq, color=true_country_code, group=true_country_code)) +
geom_line() + geom_point() + theme_bw() +
xlab("Year of Article") +
ylab("Number of Articles (10 articles/year) with \n at least one Country Mention") +
ylim(c(0, 10)) +
ggtitle("Country Mention by Year")

subregion_agg = data.frame(table(eval_df[,c("true_un_subregion", "year")]))
subregion_agg = unique(eval_df[,c("file_id","true_un_subregion", "year")])
subregion_agg = data.frame(table(subregion_agg[,c("true_un_subregion", "year")]))
ggplot(subregion_agg, aes(x=year, y=Freq, color=true_un_subregion, group=true_un_subregion)) +
geom_line() + geom_point() + theme_bw() +
xlab("Year of Article") +
ylab("Number of Articles (10 articles/year) with \n at least one UN Subregion Mention") +
ggtitle("Subregion Mention by Year")

region_agg = data.frame(table(eval_df[,c("true_un_region", "year")]))
region_agg = unique(eval_df[,c("file_id","true_un_region", "year")])
region_agg = data.frame(table(region_agg[,c("true_un_region", "year")]))
ggplot(region_agg, aes(x=year, y=Freq, color=true_un_region, group=true_un_region)) +
geom_line() + geom_point() + theme_bw() +
xlab("Year of Article") +
Expand All @@ -226,4 +230,169 @@ Now lets first look at the benchmark data

```

OK, so we see a strong signal that the US, the Americas, and Europe are mentioned at a much
higher rate than other regions.
We would also like to see this pattern in our predicted locations, but first we
need to show that our estimates are accurate.
Shown below are analyses comparing our hand-annotated benchmark data
against the fully automated processed data.
We would like to show that the true number of articles with a region mention
is highly correlated with the estimated number of articles from our full pipeline.
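
As a minimal sketch of that correlation check (not part of this pull request; the toy counts and the `freq_df` name are illustrative only, mirroring the frequency tables built later in this notebook):

```r
# toy per-region article counts standing in for the table() outputs
# assembled further down in this notebook (illustrative values only)
freq_df = data.frame(
  un_region = c("Americas", "Europe", "Asia", "Africa", "Oceania"),
  True_Freq = c(95, 60, 22, 8, 3),
  Pred_Freq = c(90, 64, 18, 10, 2)
)

# Pearson correlation of true vs. estimated counts; a value near 1
# supports substituting the pipeline estimates for hand annotation
cor(freq_df$True_Freq, freq_df$Pred_Freq, method = "pearson")

# Spearman is a useful companion check when a few regions dominate the counts
cor(freq_df$True_Freq, freq_df$Pred_Freq, method = "spearman")
```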


First let's take a look at the prediction errors for UN Subregions

```{r fig.align='center', fig.width = 15, fig.height = 15, echo=FALSE, warning=FALSE, message=F}


# join the df to make comparison
bm_loc_df$text = tolower(bm_loc_df$text)
raw_loc_df$text = tolower(raw_loc_df$text)
compare_df = merge(bm_loc_df, raw_loc_df, by=c("file_id", "text"), all.x=T)

# now we only count ONCE per article
compare_df = subset(compare_df, select = -c(text))
compare_df = unique(compare_df)

#compare per country
country_idx = which(colnames(compare_df) == "est_un_subregion")
true_country_idx = which(colnames(compare_df) == "true_un_subregion")
country_match = apply(compare_df, 1,
function(x) x[country_idx] == x[true_country_idx])

compare_df$is_country_correct = country_match

# write out confusion tables

# first need to format the levels
compare_df$est_un_subregion = as.factor(compare_df$est_un_subregion)
compare_df$true_un_subregion = as.factor(compare_df$true_un_subregion)

all_levels = unique(c(levels(compare_df$est_un_subregion),
levels(compare_df$true_un_subregion)))

missing_levels = setdiff(all_levels, levels(compare_df$est_un_subregion))
levels(compare_df$est_un_subregion) =
c(levels(compare_df$est_un_subregion), missing_levels)

missing_levels = setdiff(all_levels, levels(compare_df$true_un_subregion))
levels(compare_df$true_un_subregion) =
c(levels(compare_df$true_un_subregion), missing_levels)

confusion_matrix <- confusionMatrix(compare_df$est_un_subregion,
compare_df$true_un_subregion)
gg_conf = prettyConfused(compare_df$true_un_subregion, compare_df$est_un_subregion, text.scl = 5)
gg_conf = gg_conf + ggtitle(paste("UN Subregion Prediction Kappa:",
round(confusion_matrix$overall['Kappa'], 4))) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

gg_conf



```
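
For reference, the `caret` confusion-matrix object computed above also carries per-class metrics beyond the Kappa shown in the plot title; a minimal sketch, reusing the `confusion_matrix` from the chunk above:

```r
# overall agreement statistics (Accuracy, Kappa, confidence interval, ...)
confusion_matrix$overall

# per-subregion sensitivity and specificity, handy for spotting which
# subregions drive the disagreements seen in the confusion matrix
round(confusion_matrix$byClass[, c("Sensitivity", "Specificity")], 3)
```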

We find that the predictions contain errors, but they are not completely off.
We would like to verify that our hand annotations and the pipeline results
are at least strongly correlated.


```{r echo=FALSE, out.width="50%", warning=FALSE, message=F}

pred_freq = as.data.frame(table(compare_df$est_country_code))
colnames(pred_freq) = c("country", "Pred_Freq")

true_freq = as.data.frame(table(compare_df$true_country_code))
colnames(true_freq) = c("country", "True_Freq")

freq_df = merge(pred_freq, true_freq, all=T)
freq_df = subset(freq_df, !country %in% c("NOT_COUNTRY", "NOT_FOUND"))

gg_corr_all = ggplot(freq_df, aes(x=Pred_Freq, y=True_Freq, label=country)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Pred. vs. True Country Frequencies")

gg_corr_subset = ggplot(subset(freq_df, Pred_Freq < 20),
aes(x=Pred_Freq, y=True_Freq, label=country)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Excluding top 2: Pred. vs. True Country Frequencies")

gg_corr_all
gg_corr_subset

```
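
To put a single number on the agreement shown in the scatter plots above, the correlation can also be reported directly; a minimal sketch, reusing the `freq_df` assembled in the chunk above (not part of the original notebook):

```r
# countries present on only one side have NA on the other; treat them as
# zero counts, matching how the later chunks handle missing frequencies
freq_df[is.na(freq_df)] = 0

# rank-based correlation between predicted and true per-country counts
cor(freq_df$Pred_Freq, freq_df$True_Freq, method = "spearman")
```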
Let's look at whether the subregions fare any better or worse:

```{r echo=FALSE, out.width="50%", warning=FALSE, message=F}

pred_freq = as.data.frame(table(compare_df$est_un_subregion))
colnames(pred_freq) = c("un_subregion", "Pred_Freq")

true_freq = as.data.frame(table(compare_df$true_un_subregion))
colnames(true_freq) = c("un_subregion", "True_Freq")

freq_df = merge(pred_freq, true_freq, all=T)
freq_df = subset(freq_df, un_subregion != "NO_EST")
freq_df[is.na(freq_df)] = 0

gg_corr_all = ggplot(freq_df, aes(x=Pred_Freq, y=True_Freq, label=un_subregion)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Pred. vs. True UN Subregion Frequencies")

gg_corr_subset = ggplot(subset(freq_df, Pred_Freq < 60),
aes(x=Pred_Freq, y=True_Freq, label=un_subregion)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Excluding top 1: Pred. vs. True UN Subregion Frequencies")

gg_corr_all
gg_corr_subset

```

Now, finally, the larger UN regions:

```{r echo=FALSE, out.width="50%", warning=FALSE, message=F}

pred_freq = as.data.frame(table(compare_df$est_un_region))
colnames(pred_freq) = c("un_region", "Pred_Freq")

true_freq = as.data.frame(table(compare_df$true_un_region))
colnames(true_freq) = c("un_region", "True_Freq")

freq_df = merge(pred_freq, true_freq, all=T)
freq_df = subset(freq_df, un_region != "NO_EST")
freq_df[is.na(freq_df)] = 0

gg_corr_all = ggplot(freq_df, aes(x=Pred_Freq, y=True_Freq, label=un_region)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
xlim(c(0,110)) + ylim(c(0,110)) +
ggtitle("Pred. vs. True UN Region Frequencies")

gg_corr_subset = ggplot(subset(freq_df, Pred_Freq < 60),
aes(x=Pred_Freq, y=True_Freq, label=un_region)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
xlim(c(0,20)) + ylim(c(0,20)) +
ggtitle("Excluding top Europe+Americas: Pred. vs. True UN Region Frequencies")

gg_corr_all
gg_corr_subset

```