Skip to content

Commit

Permalink
Lots of work done in this new version. See release notes.
Browse files Browse the repository at this point in the history
  • Loading branch information
PierreMesure committed Feb 11, 2019
1 parent c8db5f0 commit 4af3d5f
Show file tree
Hide file tree
Showing 27 changed files with 895 additions and 337 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
*.db
*.db-journal

*.pdf
__pycache__
env/
.DS_Store
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,6 @@ Write some code to clean the data or to visualise it! 👩🏽‍💻
contact the government to ask them to release it! Explain to them what you could do with it. 👨🏻‍⚕️
- ask the government to reform its [offentlighetsprincip](https://sv.wikipedia.org/wiki/Offentlighetsprincipen)
to require that any piece of public data be available online in a structured format.
Canada, the UK or France are doing it, it's time for Sweden to catch up! 🙋🏻‍♀️
Canada, the UK, Germany or France are doing it, it's time for Sweden to catch up! 🙋🏻‍♀️

And don't hesitate to contact us, we love hearing from opengov enthusiasts! ❤️
109 changes: 109 additions & 0 deletions api/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import graphene
from graphene_sqlalchemy import SQLAlchemyObjectType

from database.answer import Answer as AnswerModel
from database.consultee_list import ConsulteeList as ConsulteeListModel
from database.consultee import Consultee as ConsulteeModel
from database.document import Document as DocumentModel
from database.file import File as FileModel
from database.remiss import Remiss as RemissModel

from service.database import Database

class FileAttribute:
    """Graphene field declarations describing a file (name and URL)."""

    name = graphene.String(
        description="Name of the file.",
    )
    url = graphene.String(
        description="URL of the file.",
    )


# GraphQL object type built from the SQLAlchemy ``FileModel``.
class File(SQLAlchemyObjectType):

    class Meta:
        model = FileModel


class DocumentAttribute:
    """Graphene field declarations shared by document-like types.

    The other ``*Attribute`` classes in this module reuse these field
    objects by reference.
    """

    remiss_id = graphene.Int(
        description="Id of the answer's remiss.",
    )
    type = graphene.String(
        description="Type of the document.",
    )
    files = graphene.List(
        File,
        description="Files of the document.",
    )


# GraphQL object type built from the SQLAlchemy ``DocumentModel``.
class Document(SQLAlchemyObjectType):

    class Meta:
        model = DocumentModel


class AnswerAttribute:
    """Graphene field declarations describing an answer.

    Adds ``organisation`` and reuses the ``remiss_id``, ``type`` and
    ``files`` field objects declared on ``DocumentAttribute``.
    """

    organisation = graphene.String(
        description="Organisation or individual which authored the answer.",
    )

    # Shared with DocumentAttribute: the very same field objects are reused.
    remiss_id = DocumentAttribute.remiss_id
    type = DocumentAttribute.type
    files = DocumentAttribute.files


# GraphQL object type built from the SQLAlchemy ``AnswerModel``.
class Answer(SQLAlchemyObjectType):

    class Meta:
        model = AnswerModel


class ConsulteeAttribute:
    """Graphene field declarations describing a consultee (its name)."""

    name = graphene.String(
        description="Name of the consultee.",
    )


# GraphQL object type built from the SQLAlchemy ``ConsulteeModel``.
class Consultee(SQLAlchemyObjectType):

    class Meta:
        model = ConsulteeModel


class ConsulteeListAttribute:
    """Graphene field declarations describing a consultee list.

    Adds ``consultee_list`` and reuses the ``remiss_id``, ``type`` and
    ``files`` field objects declared on ``DocumentAttribute``.
    """

    consultee_list = graphene.List(
        Consultee, description="List of all the consultees in the document.")

    # Shared with DocumentAttribute: the very same field objects are reused.
    remiss_id = DocumentAttribute.remiss_id
    type = DocumentAttribute.type
    files = DocumentAttribute.files


# GraphQL object type built from the SQLAlchemy ``ConsulteeListModel``.
class ConsulteeList(SQLAlchemyObjectType):

    class Meta:
        model = ConsulteeListModel


# GraphQL object type built from the SQLAlchemy ``RemissModel``.
class Remiss(SQLAlchemyObjectType):

    class Meta:
        model = RemissModel


# Root query type of the GraphQL API.
#
# Every model is exposed twice: a singular field returning the first row and
# a plural field returning all rows. Each declared field has an explicit
# resolver; a field without one would resolve to null for every request.
class Query(graphene.ObjectType):
    answer = graphene.Field(Answer)
    answers = graphene.List(Answer)

    consultee_list = graphene.Field(ConsulteeList)
    consultee_lists = graphene.List(ConsulteeList)

    consultee = graphene.Field(Consultee)
    consultees = graphene.List(Consultee)

    document = graphene.Field(Document)
    documents = graphene.List(Document)

    file = graphene.Field(File)
    files = graphene.List(File)

    remiss = graphene.Field(Remiss)
    remisser = graphene.List(Remiss)

    def resolve_answer(self, *args, **kwargs):
        """Return the first answer, or None when there is none."""
        return Database.query(AnswerModel).first()

    def resolve_answers(self, *args, **kwargs):
        """Return every answer."""
        return Database.query(AnswerModel).all()

    def resolve_consultee_list(self, *args, **kwargs):
        """Return the first consultee list, or None when there is none."""
        return ConsulteeListModel.query.first()

    def resolve_consultee_lists(self, *args, **kwargs):
        """Return every consultee list."""
        return ConsulteeListModel.query.all()

    def resolve_consultee(self, *args, **kwargs):
        """Return the first consultee, or None when there is none."""
        return ConsulteeModel.query.first()

    def resolve_consultees(self, *args, **kwargs):
        """Return every consultee."""
        return ConsulteeModel.query.all()

    def resolve_document(self, *args, **kwargs):
        """Return the first document, or None when there is none."""
        return DocumentModel.query.first()

    def resolve_documents(self, *args, **kwargs):
        """Return every document."""
        return DocumentModel.query.all()

    def resolve_file(self, *args, **kwargs):
        """Return the first file, or None when there is none."""
        return FileModel.query.first()

    def resolve_files(self, *args, **kwargs):
        """Return every file."""
        return FileModel.query.all()

    def resolve_remiss(self, *args, **kwargs):
        """Return the first remiss, or None when there is none."""
        return RemissModel.query.first()

    def resolve_remisser(self, *args, **kwargs):
        """Return every remiss."""
        return RemissModel.query.all()


# Executable schema served by the application.
schema = graphene.Schema(query=Query)
26 changes: 26 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from flask import Flask
from flask_graphql import GraphQLView

from service.database import Database
from api.schema import schema

# Flask application serving the GraphQL schema.
app = Flask(__name__)
# NOTE(review): debug mode is switched on unconditionally here.
app.debug = True

# One endpoint, /graphql, with the GraphiQL in-browser explorer enabled.
_graphql_view = GraphQLView.as_view('graphql', schema=schema, graphiql=True)
app.add_url_rule('/graphql', view_func=_graphql_view)


@app.teardown_appcontext
def shutdown_session(exception=None):
    """Call ``Database.remove()`` when the application context is torn down.

    ``exception`` is accepted because the teardown hook passes it, but it is
    not used.
    """
    Database.remove()


if __name__ == "__main__":
    # Run Flask's built-in server when this module is executed as a script.
    app.run()
79 changes: 79 additions & 0 deletions build_remissinstans_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import urllib.request
import os

from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFSyntaxError

from database.file import File
from database.document import Document
from database.consultee_list import ConsulteeList
from database.consultee import Consultee
from database.answer import Answer
from database.remiss import Remiss

from service.downloader import Downloader
from service.document_parser import DocumentParser
from service.database import Database
from service.file_manager import FileManager
from io import BytesIO

# Script: download each consultee-list PDF, extract the consultees from it
# and store them on the matching document.

# RESET_DB: wipe stored consultees and re-extract every document.
# RESET_FILES: re-download PDFs even when a local copy already exists.
RESET_DB = False
RESET_FILES = False

if RESET_DB:
    Database.delete_all(Consultee)
    Database.commit()
    print('Emptied the consultee table.\n')

saved_consultee_lists = ConsulteeList.query.all()
saved_consultees = Consultee.query.all()
saved_remisser = Remiss.query.all()
saved_documents = Document.query.filter(Document.type == 'consultee_list')

nb_of_consultees = len(saved_consultees)
print(f'Found {nb_of_consultees} consultees in the database.')

for document in saved_documents:

    if RESET_DB:
        # Drop whatever was stored for this document before re-extracting.
        Consultee.query.filter(
            Consultee.consultee_list_id == document.id).delete()
    elif document.consultee_list != []:
        print(f'Consultees for remiss {document.remiss_id} already in database.')
        continue

    filepath = f'tmp/{document.remiss_id}/{document.id}.pdf'

    if RESET_FILES or not FileManager.filepath_exists(filepath):
        # Start from None on every iteration: after a failed download the
        # name must not be unbound, nor still hold the previous document's
        # bytes (which would then be written under this document's path).
        content = None
        try:
            content = Downloader.get(document.files[0].url)
        except urllib.error.HTTPError:
            print(f'404: Remissinstans {document.remiss_id} not found.')

        if content is not None:
            FileManager.write_to_filepath(filepath, content)

    if not FileManager.filepath_exists(filepath):
        # Nothing on disk (download failed or returned nothing): skip.
        continue

    fp = FileManager.get_filepath(filepath)
    try:
        try:
            consultees = DocumentParser.extract_list(fp)
        except (PDFTextExtractionNotAllowed, PDFSyntaxError):
            consultees = None

        # A parser error and an empty result are reported the same way.
        if not consultees:
            print(f'Document {document.remiss_id} could not be extracted.')
            continue

        document.consultee_list = consultees

        Database.commit()

        print(f'Saved {len(consultees)} organisations for remiss {document.remiss_id}')
    finally:
        # Close the handle on every path, including the early ``continue``.
        fp.close()

Database.close()
20 changes: 20 additions & 0 deletions clean_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from service.database import Database
from service.cleaner import Cleaner
from database.answer import Answer

# Script: derive each answer's organisation name from its first file name.
saved_answers = Answer.query.all()

# When True, recompute the organisation even for answers that have one.
RESET_DB = False

print('II-2 Cleaning filenames to get organisation name...')

total_answers = len(saved_answers)
# Report progress roughly every percent. Clamp to 1 so that fewer than 100
# answers does not produce a modulo-by-zero.
progress_step = max(1, total_answers // 100)

for index, answer in enumerate(saved_answers, start=1):
    if index % progress_step == 0:
        # ``index`` is already 1-based, so it is the number processed so far.
        print(f'{index * 100 // total_answers} % cleaned')

    if RESET_DB or answer.organisation is None:
        organisation_name = Cleaner.get_organisation_name(answer.files[0].name)
        if organisation_name != answer.organisation:
            answer.organisation = organisation_name
            # Persist each change as soon as it is made.
            Database.commit()
17 changes: 17 additions & 0 deletions database/answer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from database.document import Document
from sqlalchemy import ForeignKey, Column, Integer, String
from sqlalchemy.orm import relationship


class Answer(Document):
    """Answer model.

    Subclass of ``Document`` stored in the ``answer`` table; its primary key
    is also a foreign key to ``document.id``.
    """

    __tablename__ = 'answer'
    __mapper_args__ = dict(polymorphic_identity='answer')

    id = Column(Integer, ForeignKey('document.id'), primary_key=True)
    remiss = relationship('Remiss', back_populates='answers')
    # Name of the answer's author.
    organisation = Column(String)
19 changes: 19 additions & 0 deletions database/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker
import os


# SQLite database file stored next to this module.
db_name = 'database.db'
db_path = os.path.join(os.path.dirname(__file__), db_name)
db_uri = f'sqlite:///{db_path}'
engine = create_engine(db_uri, convert_unicode=True)

# Declarative base class for all models, with the engine bound to its
# metadata.
Base = declarative_base()
Base.metadata.bind = engine

# Scoped session shared by the application; objects are not expired on
# commit.
_session_factory = sessionmaker(bind=engine, expire_on_commit=False)
db_session = scoped_session(_session_factory)

# Gives every model a ``.query`` attribute (used by graphql to execute
# queries).
Base.query = db_session.query_property()
15 changes: 15 additions & 0 deletions database/consultee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .base import Base
from sqlalchemy import ForeignKey, Column, Integer, String
from sqlalchemy.orm import relationship


class Consultee(Base):
    """Consultee model: a named entry belonging to one consultee list."""

    __tablename__ = 'consultee'

    id = Column(Integer, primary_key=True)
    consultee_list_id = Column(Integer, ForeignKey('consultee_list.id'))
    name = Column(String)

    # Paired with the relationship of the same name on ``ConsulteeList``.
    consultee_list = relationship(
        'ConsulteeList',
        back_populates='consultee_list',
    )
17 changes: 17 additions & 0 deletions database/consultee_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from database.document import Document
from sqlalchemy import ForeignKey, Column, Integer
from sqlalchemy.orm import relationship


class ConsulteeList(Document):
    """ConsulteeList model.

    Subclass of ``Document`` stored in the ``consultee_list`` table; its
    primary key is also a foreign key to ``document.id``.
    """

    __tablename__ = 'consultee_list'
    __mapper_args__ = dict(polymorphic_identity='consultee_list')

    id = Column(Integer, ForeignKey('document.id'), primary_key=True)
    remiss = relationship('Remiss', back_populates='consultees')
    # Paired with the relationship of the same name on ``Consultee``.
    consultee_list = relationship(
        'Consultee',
        back_populates='consultee_list',
    )
20 changes: 20 additions & 0 deletions database/content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .base import Base
from sqlalchemy import Column, Integer, String, Date


class Content(Base):
    """Content model: polymorphic base discriminated by the ``type`` column."""

    __tablename__ = 'content'

    id = Column(Integer, primary_key=True)
    issuer = Column(String)
    published_on = Column(Date)
    title = Column(String)
    url = Column(String)
    type = Column(String)

    # Must follow ``type``: the mapper discriminates on that column object.
    __mapper_args__ = dict(
        polymorphic_identity='content',
        polymorphic_on=type,
    )
20 changes: 20 additions & 0 deletions database/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .base import Base
from sqlalchemy import ForeignKey, Column, Integer, String
from sqlalchemy.orm import relationship


class Document(Base):
    """Document model: polymorphic base discriminated by the ``type`` column.

    Each document references one remiss and owns a collection of files.
    """

    __tablename__ = 'document'

    id = Column(Integer, primary_key=True)
    remiss_id = Column(Integer, ForeignKey('remiss.id'))
    remiss = relationship('Remiss', back_populates='other_documents')
    files = relationship('File', back_populates='document')
    type = Column(String)

    # Must follow ``type``: the mapper discriminates on that column object.
    __mapper_args__ = dict(
        polymorphic_identity='document',
        polymorphic_on=type,
    )
15 changes: 15 additions & 0 deletions database/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .base import Base
from sqlalchemy import ForeignKey, Column, Integer, String
from sqlalchemy.orm import relationship


class File(Base):
    """File model: a named URL attached to one document."""

    __tablename__ = 'file'

    id = Column(Integer, primary_key=True)
    document_id = Column(Integer, ForeignKey('document.id'))
    name = Column(String)
    url = Column(String)

    # Paired with ``Document.files``.
    document = relationship('Document', back_populates='files')
Loading

0 comments on commit 4af3d5f

Please sign in to comment.