Lots of work done in this new version. See release notes.

DinRiksdag · Feb 11, 2019 · 4af3d5f · 4af3d5f
1 parent c8db5f0
commit 4af3d5f
Show file tree

Hide file tree

Showing 27 changed files with 895 additions and 337 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 *.db
 *.db-journal
-
+*.pdf
 __pycache__
+env/
+.DS_Store
diff --git a/README.md b/README.md
@@ -102,6 +102,6 @@ Write some code to clean the data or to visualise it! 👩🏽‍💻
 contact the government to ask them to release it! Explain to them what you could do with it. 👨🏻‍⚕️
 - ask the government to reform its [offentlighetsprincip](https://sv.wikipedia.org/wiki/Offentlighetsprincipen)
 to require that any piece of public data be available online in a structured format.
-Canada, the UK or France are doing it, it's time for Sweden to catch up! 🙋🏻‍♀️
+Canada, the UK, Germany or France are doing it; it's time for Sweden to catch up! 🙋🏻‍♀️
 
 And don't hesitate to contact us, we love hearing from opengov enthusiasts! ❤️
diff --git a/api/schema.py b/api/schema.py
@@ -0,0 +1,109 @@
+import graphene
+from graphene_sqlalchemy import SQLAlchemyObjectType
+
+from database.answer import Answer as AnswerModel
+from database.consultee_list import ConsulteeList as ConsulteeListModel
+from database.consultee import Consultee as ConsulteeModel
+from database.document import Document as DocumentModel
+from database.file import File as FileModel
+from database.remiss import Remiss as RemissModel
+
+from service.database import Database
+
+# Graphene field declarations describing a file (its name and URL).
+# NOTE(review): no object type in this module inherits from this class —
+# confirm whether it is used elsewhere.
+class FileAttribute:
+    name = graphene.String(description="Name of the file.")
+    url = graphene.String(description="URL of the file.")
+
+
+# GraphQL object type backed by the File SQLAlchemy model (FileModel).
+class File(SQLAlchemyObjectType):
+    class Meta:
+        model = FileModel
+
+
+# Graphene field declarations common to every kind of document. The
+# AnswerAttribute and ConsulteeListAttribute classes of this module reuse
+# these same field objects, so the descriptions are worded for a generic
+# document rather than for an answer.
+class DocumentAttribute:
+    remiss_id = graphene.Int(description="Id of the document's remiss.")
+    type = graphene.String(description="Type of the document.")
+    files = graphene.List(File, description="Files of the document.")
+
+
+# GraphQL object type backed by the Document SQLAlchemy model
+# (DocumentModel).
+class Document(SQLAlchemyObjectType):
+    class Meta:
+        model = DocumentModel
+
+
+# Graphene field declarations for an answer: its own ``organisation`` field
+# plus the remiss_id/type/files field objects taken from DocumentAttribute.
+class AnswerAttribute:
+    organisation = graphene.String(
+        description="Organisation or individual which authored the answer.")
+    remiss_id = DocumentAttribute.remiss_id
+    type = DocumentAttribute.type
+    files = DocumentAttribute.files
+
+
+# GraphQL object type backed by the Answer SQLAlchemy model (AnswerModel).
+class Answer(SQLAlchemyObjectType):
+    class Meta:
+        model = AnswerModel
+
+
+# Graphene field declaration describing a consultee (its name).
+class ConsulteeAttribute:
+    name = graphene.String(description="Name of the consultee.")
+
+
+# GraphQL object type backed by the Consultee SQLAlchemy model
+# (ConsulteeModel).
+class Consultee(SQLAlchemyObjectType):
+    class Meta:
+        model = ConsulteeModel
+
+
+# Graphene field declarations for a consultee-list document: the list of
+# consultees itself plus the remiss_id/type/files field objects taken from
+# DocumentAttribute.
+class ConsulteeListAttribute:
+    consultee_list = graphene.List(
+        Consultee,
+        description="List of all the consultees in the document.",
+    )
+    remiss_id = DocumentAttribute.remiss_id
+    type = DocumentAttribute.type
+    files = DocumentAttribute.files
+
+
+# GraphQL object type backed by the ConsulteeList SQLAlchemy model
+# (ConsulteeListModel).
+class ConsulteeList(SQLAlchemyObjectType):
+    class Meta:
+        model = ConsulteeListModel
+
+
+# GraphQL object type backed by the Remiss SQLAlchemy model (RemissModel).
+class Remiss(SQLAlchemyObjectType):
+    class Meta:
+        model = RemissModel
+
+
+class Query(graphene.ObjectType):
+    # Root query type. Every model is exposed twice: a singular field that
+    # resolves to the first row returned by the query, and a plural field
+    # that resolves to all rows. No filtering or sorting arguments are
+    # declared.
+    #
+    # Answers are fetched through Database.query(); the other resolvers use
+    # the ``query`` property that database/base.py attaches to the
+    # declarative base class.
+    #
+    # A field without a resolver falls back to graphene's default resolver,
+    # which has nothing to read on the root value and therefore yields null;
+    # hence every field declared here has an explicit resolver.
+    answer = graphene.Field(Answer)
+    answers = graphene.List(Answer)
+
+    def resolve_answer(self, *args, **kwargs):
+        return Database.query(AnswerModel).first()
+
+    def resolve_answers(self, *args, **kwargs):
+        return Database.query(AnswerModel).all()
+
+    consultee_list = graphene.Field(ConsulteeList)
+    consultee_lists = graphene.List(ConsulteeList)
+
+    def resolve_consultee_list(self, *args, **kwargs):
+        return ConsulteeListModel.query.first()
+
+    def resolve_consultee_lists(self, *args, **kwargs):
+        return ConsulteeListModel.query.all()
+
+    consultee = graphene.Field(Consultee)
+    consultees = graphene.List(Consultee)
+
+    def resolve_consultee(self, *args, **kwargs):
+        return ConsulteeModel.query.first()
+
+    def resolve_consultees(self, *args, **kwargs):
+        return ConsulteeModel.query.all()
+
+    document = graphene.Field(Document)
+    documents = graphene.List(Document)
+
+    def resolve_document(self, *args, **kwargs):
+        return DocumentModel.query.first()
+
+    def resolve_documents(self, *args, **kwargs):
+        return DocumentModel.query.all()
+
+    file = graphene.Field(File)
+    files = graphene.List(File)
+
+    def resolve_file(self, *args, **kwargs):
+        return FileModel.query.first()
+
+    def resolve_files(self, *args, **kwargs):
+        return FileModel.query.all()
+
+    remiss = graphene.Field(Remiss)
+    remisser = graphene.List(Remiss)
+
+    def resolve_remiss(self, *args, **kwargs):
+        return RemissModel.query.first()
+
+    def resolve_remisser(self, *args, **kwargs):
+        return RemissModel.query.all()
+
+
+# Schema object imported by app.py; it only defines queries (no mutations).
+schema = graphene.Schema(query=Query)
diff --git a/app.py b/app.py
@@ -0,0 +1,26 @@
+from flask import Flask
+from flask_graphql import GraphQLView
+
+from service.database import Database
+from api.schema import schema
+
+# Flask application that serves the GraphQL schema.
+app = Flask(__name__)
+# NOTE(review): debug mode is hard-coded on — confirm this application is
+# never served on a publicly reachable host in this configuration.
+app.debug = True
+
+# Expose the schema on /graphql with the GraphiQL interface enabled.
+app.add_url_rule(
+    '/graphql',
+    view_func=GraphQLView.as_view(
+        'graphql',
+        schema=schema,
+        graphiql=True
+    )
+)
+
+
+@app.teardown_appcontext
+def shutdown_session(exception=None):
+    """Call ``Database.remove()`` when the application context is torn down.
+
+    ``exception`` is accepted because Flask passes it to teardown handlers;
+    it is not used here.
+    """
+    Database.remove()
+
+
+# Start Flask's built-in server only when this file is run as a script.
+if __name__ == '__main__':
+    app.run()
diff --git a/build_remissinstans_list.py b/build_remissinstans_list.py
@@ -0,0 +1,79 @@
+import urllib.request
+import os
+
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfparser import PDFSyntaxError
+
+from database.file import File
+from database.document import Document
+from database.consultee_list import ConsulteeList
+from database.consultee import Consultee
+from database.answer import Answer
+from database.remiss import Remiss
+
+from service.downloader import Downloader
+from service.document_parser import DocumentParser
+from service.database import Database
+from service.file_manager import FileManager
+from io import BytesIO
+
+# Script: for every consultee-list document, make sure its PDF is available
+# under tmp/, extract the consultees from it and store them in the database.
+#
+# RESET_DB = True empties the consultee table and re-extracts every list.
+# RESET_FILES = True downloads the PDFs again even when a local copy exists.
+RESET_DB = False
+RESET_FILES = False
+
+if RESET_DB:
+    Database.delete_all(Consultee)
+    Database.commit()
+    print('Emptied the consultee table.\n')
+
+saved_consultees = Consultee.query.all()
+saved_documents = Document.query.filter(Document.type == 'consultee_list')
+
+nb_of_consultees = len(saved_consultees)
+print(f'Found {nb_of_consultees} consultees in the database.')
+
+for document in saved_documents:
+
+    if RESET_DB:
+        Consultee.query.filter(
+            Consultee.consultee_list_id == document.id).delete()
+    elif document.consultee_list:
+        print(f'Consultees for remiss {document.remiss_id} already in database.')
+        continue
+
+    filepath = f'tmp/{document.remiss_id}/{document.id}.pdf'
+
+    if RESET_FILES or not FileManager.filepath_exists(filepath):
+        try:
+            content = Downloader.get(document.files[0].url)
+        except urllib.error.HTTPError:
+            print(f'404: Remissinstans {document.remiss_id} not found.')
+            # Reset explicitly: otherwise the name would be unbound on the
+            # first document, or still hold the previous document's bytes,
+            # which would then be written to this document's path.
+            content = None
+
+        if content is not None:
+            FileManager.write_to_filepath(filepath, content)
+
+    # A failed download is not fatal when a previously saved copy exists.
+    if not FileManager.filepath_exists(filepath):
+        continue
+
+    fp = FileManager.get_filepath(filepath)
+    try:
+        consultees = DocumentParser.extract_list(fp)
+    except (PDFTextExtractionNotAllowed, PDFSyntaxError):
+        consultees = None
+    finally:
+        # Close the file on every path, including the skipped documents.
+        fp.close()
+
+    if not consultees:
+        print(f'Document {document.remiss_id} could not be extracted.')
+        continue
+
+    document.consultee_list = consultees
+
+    Database.commit()
+
+    print(f'Saved {len(consultees)} organisations for remiss {document.remiss_id}')
+
+Database.close()
diff --git a/clean_data.py b/clean_data.py
@@ -0,0 +1,20 @@
+from service.database import Database
+from service.cleaner import Cleaner
+from database.answer import Answer
+
+# Script: derive the organisation name of each answer from the name of its
+# first file and store it on the answer.
+saved_answers = Answer.query.all()
+
+# Set to True to recompute the organisation of every answer instead of only
+# the answers that have none yet.
+RESET_DB = False
+
+total = len(saved_answers)
+# Report progress about once per percent. The step is clamped to 1 so that
+# fewer than 100 answers does not lead to a modulo by zero.
+progress_step = max(total // 100, 1)
+
+print('II-2 Cleaning filenames to get organisation name...')
+for index, answer in enumerate(saved_answers, start=1):
+    if index % progress_step == 0:
+        # ``index`` is already 1-based, so it is the number of answers
+        # reached so far.
+        print(f'{index * 100 // total} % cleaned')
+
+    if RESET_DB or answer.organisation is None:
+        organisation_name = Cleaner.get_organisation_name(answer.files[0].name)
+        if organisation_name != answer.organisation:
+            answer.organisation = organisation_name
+            Database.commit()
diff --git a/database/answer.py b/database/answer.py
@@ -0,0 +1,17 @@
+from database.document import Document
+from sqlalchemy import ForeignKey, Column, Integer, String
+from sqlalchemy.orm import relationship
+
+
+class Answer(Document):
+    """Answer model."""
+
+    __tablename__ = 'answer'
+
+    # The primary key is also a foreign key to the parent ``document`` row.
+    id = Column(Integer, ForeignKey('document.id'), primary_key=True)
+    # Redeclares the ``remiss`` relationship inherited from Document so that
+    # it pairs with ``Remiss.answers`` (Remiss model not shown here).
+    remiss = relationship('Remiss', back_populates='answers')
+    # Filled in by clean_data.py from the name of the answer's first file.
+    organisation = Column(String)
+
+    # Value of the ``type`` discriminator column that maps to this class.
+    __mapper_args__ = {
+        'polymorphic_identity': 'answer',
+    }
diff --git a/database/base.py b/database/base.py
@@ -0,0 +1,19 @@
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import scoped_session, sessionmaker
+import os
+
+
+# Create database engine: a SQLite file named ``database.db`` stored next to
+# this module.
+db_name = 'database.db'
+db_path = os.path.join(os.path.dirname(__file__), db_name)
+db_uri = 'sqlite:///{}'.format(db_path)
+# ``convert_unicode`` is intentionally not passed: the flag is deprecated in
+# SQLAlchemy and unnecessary on Python 3, where the SQLite driver already
+# returns ``str`` values.
+engine = create_engine(db_uri)
+
+# Declarative base model to create database tables and classes
+Base = declarative_base()
+Base.metadata.bind = engine  # Bind engine to metadata of the base class
+
+# Create database session object. ``expire_on_commit=False`` keeps loaded
+# attributes available on instances after a commit.
+db_session = scoped_session(sessionmaker(bind=engine, expire_on_commit=False))
+Base.query = db_session.query_property()  # Used by graphql to execute queries
diff --git a/database/consultee.py b/database/consultee.py
@@ -0,0 +1,15 @@
+from .base import Base
+from sqlalchemy import ForeignKey, Column, Integer, String
+from sqlalchemy.orm import relationship
+
+
+class Consultee(Base):
+    """Consultee model."""
+
+    __tablename__ = 'consultee'
+
+    id = Column(Integer, primary_key=True)
+    # Foreign key to the consultee list this row belongs to.
+    consultee_list_id = Column(Integer, ForeignKey('consultee_list.id'))
+    # The owning ConsulteeList; paired with ``ConsulteeList.consultee_list``,
+    # which holds the consultees of that list.
+    consultee_list = relationship('ConsulteeList',
+                                  back_populates='consultee_list')
+    name = Column(String)
diff --git a/database/consultee_list.py b/database/consultee_list.py
@@ -0,0 +1,17 @@
+from database.document import Document
+from sqlalchemy import ForeignKey, Column, Integer
+from sqlalchemy.orm import relationship
+
+
+class ConsulteeList(Document):
+    """ConsulteeList model."""
+
+    __tablename__ = 'consultee_list'
+
+    # The primary key is also a foreign key to the parent ``document`` row.
+    id = Column(Integer, ForeignKey('document.id'), primary_key=True)
+    # Redeclares the ``remiss`` relationship inherited from Document so that
+    # it pairs with ``Remiss.consultees`` (Remiss model not shown here).
+    remiss = relationship('Remiss', back_populates='consultees')
+    # Consultee rows of this list; paired with ``Consultee.consultee_list``.
+    consultee_list = relationship('Consultee', back_populates='consultee_list')
+
+    # Value of the ``type`` discriminator column that maps to this class.
+    __mapper_args__ = {
+        'polymorphic_identity': 'consultee_list',
+    }
diff --git a/database/content.py b/database/content.py
@@ -0,0 +1,20 @@
+from .base import Base
+from sqlalchemy import Column, Integer, String, Date
+
+
+class Content(Base):
+    """Content model."""
+
+    __tablename__ = 'content'
+
+    id = Column(Integer, primary_key=True)
+    issuer = Column(String)
+    published_on = Column(Date)
+    title = Column(String)
+    url = Column(String)
+    # Discriminator column used for polymorphic loading (see below).
+    type = Column(String)
+
+    # Base of a polymorphic hierarchy keyed on ``type``; plain Content rows
+    # carry the value 'content'.
+    __mapper_args__ = {
+        'polymorphic_identity': 'content',
+        'polymorphic_on': type
+    }
diff --git a/database/document.py b/database/document.py
@@ -0,0 +1,20 @@
+from .base import Base
+from sqlalchemy import ForeignKey, Column, Integer, String
+from sqlalchemy.orm import relationship
+
+
+class Document(Base):
+    """Document model."""
+
+    __tablename__ = 'document'
+
+    id = Column(Integer, primary_key=True)
+    # Foreign key to the remiss this document belongs to.
+    remiss_id = Column(Integer, ForeignKey('remiss.id'))
+    # Paired with ``Remiss.other_documents`` (Remiss model not shown here).
+    remiss = relationship('Remiss', back_populates='other_documents')
+    # File rows attached to this document; paired with ``File.document``.
+    files = relationship('File', back_populates='document')
+    # Discriminator column used for polymorphic loading (see below).
+    type = Column(String)
+
+    # Base of a polymorphic hierarchy keyed on ``type``; plain Document rows
+    # carry the value 'document'.
+    __mapper_args__ = {
+        'polymorphic_identity': 'document',
+        'polymorphic_on': type
+    }
diff --git a/database/file.py b/database/file.py
@@ -0,0 +1,15 @@
+from .base import Base
+from sqlalchemy import ForeignKey, Column, Integer, String
+from sqlalchemy.orm import relationship
+
+
+class File(Base):
+    """File model."""
+
+    __tablename__ = 'file'
+
+    id = Column(Integer, primary_key=True)
+    # Foreign key to the document this file is attached to.
+    document_id = Column(Integer, ForeignKey('document.id'))
+    # The owning Document; paired with ``Document.files``.
+    document = relationship('Document', back_populates='files')
+    name = Column(String)
+    url = Column(String)