Skip to content

Commit

Permalink
feat(harvard_merger): add harvard_id field to OpinionCluster
Browse files Browse the repository at this point in the history
Harvard's Caselaw Access Project has been sunset. For projects
which have existing references to CAP cases, there's a need to
identify a CAP case's corresponding CL opinion cluster.

An indexed `harvard_id` column is added to `OpinionCluster`. The
field is also added to the `fields` of `OpinionClusterFilter`.

For the data migration, this patch builds on work done in freelawproject#4284 and freelawproject#4442
and extends `import_harvard_pdfs` to populate the `harvard_id`
column using the CAP crosswalk file.

Fixes: freelawproject#4313
  • Loading branch information
cweider committed Oct 25, 2024
1 parent eb06405 commit cb77066
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 1 deletion.
1 change: 1 addition & 0 deletions cl/search/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class Meta:
"citation_count": INTEGER_LOOKUPS,
"precedential_status": ["exact"],
"date_blocked": DATE_LOOKUPS,
"harvard_id": ["exact"],
"blocked": ["exact"],
}

Expand Down
13 changes: 12 additions & 1 deletion cl/search/management/commands/import_harvard_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def add_arguments(self, parser):
parser.add_argument(
"--job",
type=str,
choices=["import_pdf"],
choices=["import_pdf", "assign_cap_id"],
default="import_pdf",
help="",
)
Expand Down Expand Up @@ -242,6 +242,17 @@ def process_entry(

except OpinionCluster.DoesNotExist:
logger.info(f"Cluster not found for id: {cl_cluster_id}")

case "assign_cap_id":
try:
cluster = OpinionCluster.objects.get(id=cl_cluster_id)
cluster.harvard_id = cap_case_id
if not self.dry_run:
cluster.save()

except OpinionCluster.DoesNotExist:
logger.info(f"Cluster not found for id: {cl_cluster_id}")

case _:
raise Exception(f"Unknown job {self.job}")

Expand Down
69 changes: 69 additions & 0 deletions cl/search/migrations/0037_add_harvard_id_to_opinioncluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Generated by Django 5.1.2 on 2024-10-25 18:54

import pgtrigger.compiler
import pgtrigger.migrations
from django.db import migrations, models


# Schema migration that adds a `harvard_id` column to `opinioncluster` and to
# its history table `opinionclusterevent`, then rebuilds the two history
# triggers so that the copied row includes the new column.
class Migration(migrations.Migration):

dependencies = [
("search", "0036_add_searchquery"),
]

operations = [
# Drop both existing triggers first; they are re-created at the end of
# this list with "harvard_id" added to their column lists.
pgtrigger.migrations.RemoveTrigger(
model_name="opinioncluster",
name="update_update",
),
pgtrigger.migrations.RemoveTrigger(
model_name="opinioncluster",
name="delete_delete",
),
# New indexed column on the main table.
# NOTE(review): the default is the integer 0 on a CharField; the generated
# SQL stores it as the string '0'. A string default (or blank/empty) would
# keep the Python-side type consistent -- confirm intent.
# NOTE(review): no max_length is given; presumably this relies on the
# database backend accepting an unbounded varchar -- confirm.
migrations.AddField(
model_name="opinioncluster",
name="harvard_id",
field=models.CharField(
db_index=True,
default=0,
help_text="The ID of the item in the Caselaw Access Project (Harvard)",
),
),
# Same column on the event (history) table, but without db_index.
migrations.AddField(
model_name="opinionclusterevent",
name="harvard_id",
field=models.CharField(
default=0,
help_text="The ID of the item in the Caselaw Access Project (Harvard)",
),
),
# AFTER UPDATE trigger: when any column listed in `condition` differs
# between OLD and NEW, insert the OLD row into "search_opinionclusterevent"
# with pgh_label 'update'. "date_created" and "date_modified" are copied by
# the INSERT but are not part of the change condition.
pgtrigger.migrations.AddTrigger(
model_name="opinioncluster",
trigger=pgtrigger.compiler.Trigger(
name="update_update",
sql=pgtrigger.compiler.UpsertTriggerSql(
condition='WHEN (OLD."arguments" IS DISTINCT FROM (NEW."arguments") OR OLD."attorneys" IS DISTINCT FROM (NEW."attorneys") OR OLD."blocked" IS DISTINCT FROM (NEW."blocked") OR OLD."case_name" IS DISTINCT FROM (NEW."case_name") OR OLD."case_name_full" IS DISTINCT FROM (NEW."case_name_full") OR OLD."case_name_short" IS DISTINCT FROM (NEW."case_name_short") OR OLD."citation_count" IS DISTINCT FROM (NEW."citation_count") OR OLD."correction" IS DISTINCT FROM (NEW."correction") OR OLD."cross_reference" IS DISTINCT FROM (NEW."cross_reference") OR OLD."date_blocked" IS DISTINCT FROM (NEW."date_blocked") OR OLD."date_filed" IS DISTINCT FROM (NEW."date_filed") OR OLD."date_filed_is_approximate" IS DISTINCT FROM (NEW."date_filed_is_approximate") OR OLD."disposition" IS DISTINCT FROM (NEW."disposition") OR OLD."docket_id" IS DISTINCT FROM (NEW."docket_id") OR OLD."filepath_json_harvard" IS DISTINCT FROM (NEW."filepath_json_harvard") OR OLD."filepath_pdf_harvard" IS DISTINCT FROM (NEW."filepath_pdf_harvard") OR OLD."harvard_id" IS DISTINCT FROM (NEW."harvard_id") OR OLD."headmatter" IS DISTINCT FROM (NEW."headmatter") OR OLD."headnotes" IS DISTINCT FROM (NEW."headnotes") OR OLD."history" IS DISTINCT FROM (NEW."history") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."judges" IS DISTINCT FROM (NEW."judges") OR OLD."nature_of_suit" IS DISTINCT FROM (NEW."nature_of_suit") OR OLD."other_dates" IS DISTINCT FROM (NEW."other_dates") OR OLD."posture" IS DISTINCT FROM (NEW."posture") OR OLD."precedential_status" IS DISTINCT FROM (NEW."precedential_status") OR OLD."procedural_history" IS DISTINCT FROM (NEW."procedural_history") OR OLD."scdb_decision_direction" IS DISTINCT FROM (NEW."scdb_decision_direction") OR OLD."scdb_id" IS DISTINCT FROM (NEW."scdb_id") OR OLD."scdb_votes_majority" IS DISTINCT FROM (NEW."scdb_votes_majority") OR OLD."scdb_votes_minority" IS DISTINCT FROM (NEW."scdb_votes_minority") OR OLD."slug" IS DISTINCT FROM (NEW."slug") OR OLD."source" IS DISTINCT FROM 
(NEW."source") OR OLD."summary" IS DISTINCT FROM (NEW."summary") OR OLD."syllabus" IS DISTINCT FROM (NEW."syllabus"))',
func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
hash="bc20a56b13c375017e704a6e50efd44e5c060018",
operation="UPDATE",
pgid="pgtrigger_update_update_c83f1",
table="search_opinioncluster",
when="AFTER",
),
),
),
# AFTER DELETE trigger: unconditionally insert the OLD row into
# "search_opinionclusterevent" with pgh_label 'delete'.
pgtrigger.migrations.AddTrigger(
model_name="opinioncluster",
trigger=pgtrigger.compiler.Trigger(
name="delete_delete",
sql=pgtrigger.compiler.UpsertTriggerSql(
func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
hash="93725d0e8785d341973cd6af46aa9b3e9aca1ec2",
operation="DELETE",
pgid="pgtrigger_delete_delete_a8516",
table="search_opinioncluster",
when="AFTER",
),
),
),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- Adds "harvard_id" to both "search_opinioncluster" and its event table
-- "search_opinionclusterevent", then indexes the column on the main table.
-- Everything runs inside a single transaction.
BEGIN;
-- Add the column as NOT NULL with a temporary default of '0' so existing rows
-- get a value, then drop the default so it is no longer applied by the database.
ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
-- Same two-step add for the event table (no index is created on it below).
ALTER TABLE "search_opinionclusterevent" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
ALTER TABLE "search_opinionclusterevent" ALTER COLUMN "harvard_id" DROP DEFAULT;
-- Two indexes on the main table: a plain one, and one using varchar_pattern_ops
-- (presumably to support LIKE / prefix lookups -- confirm it is needed, since
-- the filter shown for this field only allows "exact").
-- NOTE(review): these are non-CONCURRENTLY index builds inside a transaction;
-- confirm the resulting table lock is acceptable for a table of this size.
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
COMMIT;
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Variant of the harvard_id migration that only touches
-- "search_opinioncluster"; it does not alter "search_opinionclusterevent".
-- NOTE(review): presumably intended for databases that lack the event table --
-- confirm against the file name / deployment docs.
BEGIN;
-- Add the column as NOT NULL with a temporary default of '0' so existing rows
-- get a value, then drop the default.
ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
-- Plain index plus a varchar_pattern_ops index on the new column.
-- NOTE(review): non-CONCURRENTLY index builds inside a transaction; confirm
-- the resulting table lock is acceptable.
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
COMMIT;
5 changes: 5 additions & 0 deletions cl/search/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2731,6 +2731,11 @@ class OpinionCluster(AbstractDateTimeModel):
storage=IncrementingAWSMediaStorage(),
blank=True,
)
harvard_id = models.CharField(
help_text="The ID of the item in the Caselaw Access Project (Harvard)",
default=0,
db_index=True,
)
arguments = models.TextField(
help_text="The attorney(s) and legal arguments presented as HTML text. "
"This is primarily seen in older opinions and can contain "
Expand Down
65 changes: 65 additions & 0 deletions cl/search/tests/test_import_harvard_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,68 @@ def test_import_harvard_pdfs(
self.assertEqual(
self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
)

@patch("cl.search.management.commands.import_harvard_pdfs.tqdm")
@patch(
    "cl.search.management.commands.import_harvard_pdfs.OpinionCluster.objects.get"
)
@patch("cl.search.management.commands.import_harvard_pdfs.HarvardPDFStorage")
@patch("cl.search.management.commands.import_harvard_pdfs.boto3.client")
@patch("cl.search.management.commands.import_harvard_pdfs.os.listdir")
@patch("cl.search.management.commands.import_harvard_pdfs.os.path.exists")
def test_assign_harvard_id(
    self,
    mock_exists,
    mock_listdir,
    mock_boto3_client,
    mock_harvard_storage,
    mock_opinion_cluster_get,
    mock_tqdm,
):
    """Run the ``assign_cap_id`` job and check ``harvard_id`` is stored.

    The crosswalk directory listing, the crosswalk file contents, boto3,
    the PDF storage class, ``tqdm`` and ``OpinionCluster.objects.get`` are
    all patched. The lookup returns ``self.cluster``, and the single
    crosswalk entry maps CAP case id ``1`` to that cluster.
    """
    crosswalk_dir = "/mocked_path/crosswalk_dir"
    existing_paths = [crosswalk_dir]

    # Filesystem: only the crosswalk directory "exists", holding one file.
    mock_listdir.return_value = ["test_crosswalk.json"]
    mock_exists.side_effect = lambda path: path in existing_paths

    # External services are replaced by inert stand-ins.
    mock_boto3_client.return_value = MagicMock()
    mock_harvard_storage.return_value = MagicMock()

    # Cluster lookups resolve to the fixture cluster.
    mock_opinion_cluster_get.return_value = self.cluster

    # tqdm hands back its first argument untouched.
    mock_tqdm.side_effect = lambda x, *args, **kwargs: x

    # Contents served for every open() call made while the command runs.
    crosswalk_entry = {
        "cap_case_id": 1,
        "cl_cluster_id": self.cluster.id,
        "cap_path": "/test/path.json",
    }
    fake_open = mock_open(read_data=json.dumps([crosswalk_entry]))

    # Sanity check of the patched existence check before the command runs.
    self.assertTrue(
        os.path.exists(crosswalk_dir),
        f"Crosswalk directory does not exist: {crosswalk_dir}",
    )

    with patch("builtins.open", fake_open):
        call_command(
            "import_harvard_pdfs",
            crosswalk_dir=crosswalk_dir,
            job="assign_cap_id",
        )

    # Reload from the database; the value comes back as a string.
    self.cluster.refresh_from_db()
    self.assertEqual(self.cluster.harvard_id, "1")

0 comments on commit cb77066

Please sign in to comment.