Updated location prediction to openstreetmap #7

Merged
merged 13 commits into from Mar 8, 2021
13 changes: 13 additions & 0 deletions analysis_scripts/analysis_utils.R
@@ -2,6 +2,12 @@ library(jsonlite)
library(data.table)
library(dplyr)
library(here)  # here() is called below; adding the import unless it is loaded elsewhere in this file


proj_dir = here()
source(paste(proj_dir, "/utils/scraper_processing_utils.R", sep=""))



read_corenlp_quote_files <- function(corenlp_file){

corenlp_df = data.frame(fread(corenlp_file, header=T, quote=""))
@@ -32,10 +38,16 @@ read_benchmark_quote_file <- function(gold_file){
read_corenlp_location_files <- function(corenlp_file){

corenlp_df = data.frame(fread(corenlp_file, header=T))
country_df = get_country_info()
corenlp_df = merge(corenlp_df, country_df, all.x=T)
corenlp_df = unique(corenlp_df)

colnames(corenlp_df)[which(colnames(corenlp_df)=="address.country_code")] = "est_country_code"
colnames(corenlp_df)[which(colnames(corenlp_df)=="country")] = "est_country"
colnames(corenlp_df)[which(colnames(corenlp_df)=="un_region")] = "est_un_region"
colnames(corenlp_df)[which(colnames(corenlp_df)=="un_subregion")] = "est_un_subregion"

corenlp_df$est_country[which(is.na(corenlp_df$est_country_code))] = "NO_EST"
corenlp_df$est_country[which(is.na(corenlp_df$est_country))] = "NO_EST"
corenlp_df$est_un_region[which(is.na(corenlp_df$est_un_region))] = "NO_EST"
corenlp_df$est_un_subregion[which(is.na(corenlp_df$est_un_subregion))] = "NO_EST"
@@ -47,6 +59,7 @@ read_corenlp_location_files <- function(corenlp_file){
read_benchmark_location_file <- function(bm_loc_file){

gold_df = data.frame(fread(bm_loc_file, header=T))
colnames(gold_df)[which(colnames(gold_df)=="address.country_code")] = "true_country_code"
colnames(gold_df)[which(colnames(gold_df)=="country")] = "true_country"
colnames(gold_df)[which(colnames(gold_df)=="un_region")] = "true_un_region"
colnames(gold_df)[which(colnames(gold_df)=="un_subregion")] = "true_un_subregion"
183 changes: 176 additions & 7 deletions analysis_scripts/analyze_benchmark_data/benchmark_analysis.Rmd
@@ -12,6 +12,7 @@ require(data.table)
require(here)
require(ggplot2)
require(caret)
require(ggrepel)

proj_dir = here()
source(paste(proj_dir, "/analysis_scripts/analysis_utils.R", sep=""))
@@ -167,7 +168,7 @@ bm_loc_file = paste(proj_dir,
bm_loc_df = read_benchmark_location_file(bm_loc_file)

raw_loc_file = paste(proj_dir,
"/data/benchmark_data/benchmark_location_table_hand_annotated.tsv",
"/data/benchmark_data/benchmark_location_table_raw.tsv",
sep="")

raw_loc_df = read_corenlp_location_files(raw_loc_file)
@@ -191,33 +192,36 @@ Now lets first look at the benchmark data

# filter out any places where the gender is NA
# this can happen when a quote is from an unidentified i.e. spokesperson
eval_df = subset(bm_loc_df, !is.na(true_country))
eval_df = subset(bm_loc_df, true_country_code != "NAN")

eval_df = merge(year_df, eval_df)

# we only care if a country was mentioned once or not at all
eval_df = eval_df[,c("file_id", "true_country", "true_un_region",
eval_df = eval_df[,c("file_id", "true_country_code", "true_un_region",
"true_un_subregion", "year")]

eval_df = unique(eval_df)

## plot per year stats
country_agg = data.frame(table(eval_df[,c("true_country", "year")]))
ggplot(country_agg, aes(x=year, y=Freq, color=true_country, group=true_country)) +
country_agg = unique(eval_df[,c("file_id","true_country_code", "year")])
country_agg = data.frame(table(country_agg[,c("true_country_code", "year")]))
ggplot(country_agg, aes(x=year, y=Freq, color=true_country_code, group=true_country_code)) +
geom_line() + geom_point() + theme_bw() +
xlab("Year of Article") +
ylab("Number of Articles (10 articles/year) with \n at least one Country Mention") +
ylim(c(0, 10)) +
ggtitle("Country Mention by Year")

subregion_agg = data.frame(table(eval_df[,c("true_un_subregion", "year")]))
subregion_agg = unique(eval_df[,c("file_id","true_un_subregion", "year")])
subregion_agg = data.frame(table(subregion_agg[,c("true_un_subregion", "year")]))
ggplot(subregion_agg, aes(x=year, y=Freq, color=true_un_subregion, group=true_un_subregion)) +
geom_line() + geom_point() + theme_bw() +
xlab("Year of Article") +
ylab("Number of Articles (10 articles/year) with \n at least one UN Subregion Mention") +
ggtitle("Subregion Mention by Year")

region_agg = data.frame(table(eval_df[,c("true_un_region", "year")]))
region_agg = unique(eval_df[,c("file_id","true_un_region", "year")])
region_agg = data.frame(table(region_agg[,c("true_un_region", "year")]))
ggplot(region_agg, aes(x=year, y=Freq, color=true_un_region, group=true_un_region)) +
geom_line() + geom_point() + theme_bw() +
xlab("Year of Article") +
Expand All @@ -226,4 +230,169 @@ Now lets first look at the benchmark data

```

OK, so we see a strong signal that the US, the Americas, and Europe are mentioned at a much
higher rate than other regions.
We would also like to see this pattern in our predicted locations, but first we
need to show that our estimates are accurate.
Shown below are analyses comparing our hand-annotated benchmark data
against the fully automated processed data.
We would like to show that the true number of articles with a region mention
is highly correlated with the estimated number of articles from our full pipeline.
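
As a minimal sketch of that correlation check (not part of this pull request; the toy counts and the `freq_df` name are illustrative only, mirroring the frequency tables built later in this notebook):

```r
# toy per-region article counts standing in for the table() outputs
# assembled further down in this notebook (illustrative values only)
freq_df = data.frame(
  un_region = c("Americas", "Europe", "Asia", "Africa", "Oceania"),
  True_Freq = c(95, 60, 22, 8, 3),
  Pred_Freq = c(90, 64, 18, 10, 2)
)

# Pearson correlation of true vs. estimated counts; a value near 1
# supports substituting the pipeline estimates for hand annotation
cor(freq_df$True_Freq, freq_df$Pred_Freq, method = "pearson")

# Spearman is a useful companion check when a few regions dominate the counts
cor(freq_df$True_Freq, freq_df$Pred_Freq, method = "spearman")
```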


First let's take a look at the prediction errors for UN Subregions

```{r fig.align='center', fig.width = 15, fig.height = 15, echo=FALSE, warning=FALSE, message=F}


# join the df to make comparison
bm_loc_df$text = tolower(bm_loc_df$text)
raw_loc_df$text = tolower(raw_loc_df$text)
compare_df = merge(bm_loc_df, raw_loc_df, by=c("file_id", "text"), all.x=T)

# now we only count ONCE per article
compare_df = subset(compare_df, select = -c(text))
compare_df = unique(compare_df)

#compare per country
country_idx = which(colnames(compare_df) == "est_un_subregion")
true_country_idx = which(colnames(compare_df) == "true_un_subregion")
country_match = apply(compare_df, 1,
function(x) x[country_idx] == x[true_country_idx])

compare_df$is_country_correct = country_match

# write out confusion tables

# first need to format the levels
compare_df$est_un_subregion = as.factor(compare_df$est_un_subregion)
compare_df$true_un_subregion = as.factor(compare_df$true_un_subregion)

all_levels = unique(c(levels(compare_df$est_un_subregion),
levels(compare_df$true_un_subregion)))

missing_levels = setdiff(all_levels, levels(compare_df$est_un_subregion))
levels(compare_df$est_un_subregion) =
c(levels(compare_df$est_un_subregion), missing_levels)

missing_levels = setdiff(all_levels, levels(compare_df$true_un_subregion))
levels(compare_df$true_un_subregion) =
c(levels(compare_df$true_un_subregion), missing_levels)

confusion_matrix <- confusionMatrix(compare_df$est_un_subregion,
compare_df$true_un_subregion)
gg_conf = prettyConfused(compare_df$true_un_subregion, compare_df$est_un_subregion, text.scl = 5)
gg_conf = gg_conf + ggtitle(paste("UN Subregion Prediction Kappa:",
round(confusion_matrix$overall['Kappa'], 4))) +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

gg_conf



```
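
For reference, the `caret` confusion-matrix object computed above also carries per-class metrics beyond the Kappa shown in the plot title; a minimal sketch, reusing the `confusion_matrix` from the chunk above:

```r
# overall agreement statistics (Accuracy, Kappa, confidence interval, ...)
confusion_matrix$overall

# per-subregion sensitivity and specificity, handy for spotting which
# subregions drive the disagreements seen in the confusion matrix
round(confusion_matrix$byClass[, c("Sensitivity", "Specificity")], 3)
```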

We find that the predictions contain errors, but they are not completely off.
We would like to verify that our hand annotations and the pipeline results
are at least strongly correlated.


```{r echo=FALSE, out.width="50%", warning=FALSE, message=F}

pred_freq = as.data.frame(table(compare_df$est_country_code))
colnames(pred_freq) = c("country", "Pred_Freq")

true_freq = as.data.frame(table(compare_df$true_country_code))
colnames(true_freq) = c("country", "True_Freq")

freq_df = merge(pred_freq, true_freq, all=T)
freq_df = subset(freq_df, !country %in% c("NOT_COUNTRY", "NOT_FOUND"))

gg_corr_all = ggplot(freq_df, aes(x=Pred_Freq, y=True_Freq, label=country)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Pred. vs. True Country Frequencies")

gg_corr_subset = ggplot(subset(freq_df, Pred_Freq < 20),
aes(x=Pred_Freq, y=True_Freq, label=country)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Excluding top 2: Pred. vs. True Country Frequencies")

gg_corr_all
gg_corr_subset

```
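
To put a single number on the agreement shown in the scatter plots above, the correlation can also be reported directly; a minimal sketch, reusing the `freq_df` assembled in the chunk above (not part of the original notebook):

```r
# countries present on only one side have NA on the other; treat them as
# zero counts, matching how the later chunks handle missing frequencies
freq_df[is.na(freq_df)] = 0

# rank-based correlation between predicted and true per-country counts
cor(freq_df$Pred_Freq, freq_df$True_Freq, method = "spearman")
```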
Let's look at whether the subregions fare any better or worse:

```{r echo=FALSE, out.width="50%", warning=FALSE, message=F}

pred_freq = as.data.frame(table(compare_df$est_un_subregion))
colnames(pred_freq) = c("un_subregion", "Pred_Freq")

true_freq = as.data.frame(table(compare_df$true_un_subregion))
colnames(true_freq) = c("un_subregion", "True_Freq")

freq_df = merge(pred_freq, true_freq, all=T)
freq_df = subset(freq_df, un_subregion != "NO_EST")
freq_df[is.na(freq_df)] = 0

gg_corr_all = ggplot(freq_df, aes(x=Pred_Freq, y=True_Freq, label=un_subregion)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Pred. vs. True UN Subregion Frequencies")

gg_corr_subset = ggplot(subset(freq_df, Pred_Freq < 60),
aes(x=Pred_Freq, y=True_Freq, label=un_subregion)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
ggtitle("Excluding top 1: Pred. vs. True UN Subregion Frequencies")

gg_corr_all
gg_corr_subset

```

Now, finally, the larger UN regions:

```{r echo=FALSE, out.width="50%", warning=FALSE, message=F}

pred_freq = as.data.frame(table(compare_df$est_un_region))
colnames(pred_freq) = c("un_region", "Pred_Freq")

true_freq = as.data.frame(table(compare_df$true_un_region))
colnames(true_freq) = c("un_region", "True_Freq")

freq_df = merge(pred_freq, true_freq, all=T)
freq_df = subset(freq_df, un_region != "NO_EST")
freq_df[is.na(freq_df)] = 0

gg_corr_all = ggplot(freq_df, aes(x=Pred_Freq, y=True_Freq, label=un_region)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
xlim(c(0,110)) + ylim(c(0,110)) +
ggtitle("Pred. vs. True UN Region Frequencies")

gg_corr_subset = ggplot(subset(freq_df, Pred_Freq < 60),
aes(x=Pred_Freq, y=True_Freq, label=un_region)) +
geom_point() + geom_abline(intercept = 0, slope = 1) +
theme_bw() + geom_text_repel() +
xlab("Predicted Frequency") +
ylab("True Frequency") +
xlim(c(0,20)) + ylim(c(0,20)) +
ggtitle("Excluding top Europe+Americas: Pred. vs. True UN Region Frequencies")

gg_corr_all
gg_corr_subset

```