From 247ea5ddd0341e277abf3b32dacf47494a071a10 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 28 Jan 2023 23:56:23 -0500 Subject: [PATCH 01/82] UP009 --- bdfr/__init__.py | 1 - bdfr/__main__.py | 1 - bdfr/archive_entry/__init__.py | 1 - bdfr/archive_entry/base_archive_entry.py | 1 - bdfr/archive_entry/comment_archive_entry.py | 1 - bdfr/archive_entry/submission_archive_entry.py | 1 - bdfr/archiver.py | 1 - bdfr/cloner.py | 1 - bdfr/completion.py | 1 - bdfr/configuration.py | 1 - bdfr/connector.py | 1 - bdfr/download_filter.py | 1 - bdfr/downloader.py | 1 - bdfr/exceptions.py | 1 - bdfr/file_name_formatter.py | 1 - bdfr/oauth2.py | 1 - bdfr/resource.py | 1 - bdfr/site_authenticator.py | 1 - bdfr/site_downloaders/__init__.py | 1 - bdfr/site_downloaders/base_downloader.py | 1 - bdfr/site_downloaders/delay_for_reddit.py | 1 - bdfr/site_downloaders/direct.py | 1 - bdfr/site_downloaders/download_factory.py | 1 - bdfr/site_downloaders/erome.py | 1 - bdfr/site_downloaders/fallback_downloaders/__init__.py | 1 - .../site_downloaders/fallback_downloaders/fallback_downloader.py | 1 - bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py | 1 - bdfr/site_downloaders/gallery.py | 1 - bdfr/site_downloaders/gfycat.py | 1 - bdfr/site_downloaders/imgur.py | 1 - bdfr/site_downloaders/pornhub.py | 1 - bdfr/site_downloaders/redgifs.py | 1 - bdfr/site_downloaders/self_post.py | 1 - bdfr/site_downloaders/vidble.py | 1 - bdfr/site_downloaders/vreddit.py | 1 - bdfr/site_downloaders/youtube.py | 1 - tests/__init__.py | 1 - tests/archive_entry/__init__.py | 1 - tests/archive_entry/test_comment_archive_entry.py | 1 - tests/archive_entry/test_submission_archive_entry.py | 1 - tests/conftest.py | 1 - tests/integration_tests/__init__.py | 1 - tests/integration_tests/test_archive_integration.py | 1 - tests/integration_tests/test_clone_integration.py | 1 - tests/integration_tests/test_download_integration.py | 1 - tests/site_downloaders/__init__.py | 1 - tests/site_downloaders/fallback_downloaders/__init__.py | 1 - .../site_downloaders/fallback_downloaders/test_ytdlp_fallback.py | 1 - tests/site_downloaders/test_delay_for_reddit.py | 1 - tests/site_downloaders/test_direct.py | 1 - tests/site_downloaders/test_download_factory.py | 1 - tests/site_downloaders/test_erome.py | 1 - tests/site_downloaders/test_gallery.py | 1 - tests/site_downloaders/test_gfycat.py | 1 - tests/site_downloaders/test_imgur.py | 1 - tests/site_downloaders/test_pornhub.py | 1 - tests/site_downloaders/test_redgifs.py | 1 - tests/site_downloaders/test_self_post.py | 1 - tests/site_downloaders/test_vidble.py | 1 - tests/site_downloaders/test_vreddit.py | 1 - tests/site_downloaders/test_youtube.py | 1 - tests/test_archiver.py | 1 - tests/test_completion.py | 1 - tests/test_configuration.py | 1 - tests/test_connector.py | 1 - tests/test_download_filter.py | 1 - tests/test_downloader.py | 1 - tests/test_file_name_formatter.py | 1 - tests/test_oauth2.py | 1 - tests/test_resource.py | 1 - 70 files changed, 70 deletions(-) diff --git a/bdfr/__init__.py b/bdfr/__init__.py index 6bcee531..0576f7d5 100644 --- a/bdfr/__init__.py +++ b/bdfr/__init__.py @@ -1,4 +1,3 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- __version__ = "2.6.2" diff --git a/bdfr/__main__.py b/bdfr/__main__.py index dadba517..4cdf6d7f 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import sys diff --git a/bdfr/archive_entry/__init__.py 
b/bdfr/archive_entry/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/bdfr/archive_entry/__init__.py +++ b/bdfr/archive_entry/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/bdfr/archive_entry/base_archive_entry.py b/bdfr/archive_entry/base_archive_entry.py index 3dea5e49..f48662cb 100644 --- a/bdfr/archive_entry/base_archive_entry.py +++ b/bdfr/archive_entry/base_archive_entry.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from abc import ABC, abstractmethod from typing import Union diff --git a/bdfr/archive_entry/comment_archive_entry.py b/bdfr/archive_entry/comment_archive_entry.py index cc59373b..2c991b31 100644 --- a/bdfr/archive_entry/comment_archive_entry.py +++ b/bdfr/archive_entry/comment_archive_entry.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index 38f1d347..e5560352 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 52b4649a..60e3a88b 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import json import logging diff --git a/bdfr/cloner.py b/bdfr/cloner.py index df71c286..87fd9076 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from collections.abc import Iterable diff --git a/bdfr/completion.py b/bdfr/completion.py index 8ec4e2cb..d9f82261 100644 --- a/bdfr/completion.py +++ b/bdfr/completion.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import subprocess from os import environ diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 05fc27e8..32d02d07 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from argparse import Namespace diff --git a/bdfr/connector.py b/bdfr/connector.py index 77a4a71a..a246c5df 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import configparser import importlib.resources diff --git a/bdfr/download_filter.py b/bdfr/download_filter.py index 0def3169..00be7f6d 100644 --- a/bdfr/download_filter.py +++ b/bdfr/download_filter.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import re diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 20984e69..38b600b6 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import hashlib import logging.handlers diff --git a/bdfr/exceptions.py b/bdfr/exceptions.py index e7e44153..39b1b304 100644 --- a/bdfr/exceptions.py +++ b/bdfr/exceptions.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- class BulkDownloaderException(Exception): diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index dd04fad9..7b6d236d 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import datetime import logging diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index ead05534..2da3c843 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- 
import configparser import logging diff --git a/bdfr/resource.py b/bdfr/resource.py index 37fc5214..578e030a 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import hashlib import logging diff --git a/bdfr/site_authenticator.py b/bdfr/site_authenticator.py index 08b98e0d..5e177c93 100644 --- a/bdfr/site_authenticator.py +++ b/bdfr/site_authenticator.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import configparser diff --git a/bdfr/site_downloaders/__init__.py b/bdfr/site_downloaders/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/bdfr/site_downloaders/__init__.py +++ b/bdfr/site_downloaders/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py index e4ac111c..341aab5c 100644 --- a/bdfr/site_downloaders/base_downloader.py +++ b/bdfr/site_downloaders/base_downloader.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from abc import ABC, abstractmethod diff --git a/bdfr/site_downloaders/delay_for_reddit.py b/bdfr/site_downloaders/delay_for_reddit.py index 40a7f9b9..33807316 100644 --- a/bdfr/site_downloaders/delay_for_reddit.py +++ b/bdfr/site_downloaders/delay_for_reddit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from typing import Optional diff --git a/bdfr/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py index 061ad7f2..4a6ac92e 100644 --- a/bdfr/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from typing import Optional diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 9006681c..37128b88 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import re import urllib.parse diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index 894e470e..69799dbc 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import re diff --git a/bdfr/site_downloaders/fallback_downloaders/__init__.py b/bdfr/site_downloaders/fallback_downloaders/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/bdfr/site_downloaders/fallback_downloaders/__init__.py +++ b/bdfr/site_downloaders/fallback_downloaders/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py b/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py index 124724a2..64f662e8 100644 --- a/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py +++ b/bdfr/site_downloaders/fallback_downloaders/fallback_downloader.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from abc import ABC, abstractmethod diff --git a/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py index 41f84741..8c7fa9ff 100644 --- a/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from typing import Optional diff --git a/bdfr/site_downloaders/gallery.py 
b/bdfr/site_downloaders/gallery.py index 6f004104..278932f0 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from typing import Optional diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index d7c60ca6..7862d338 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import json import re diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index bfcecc09..65037eab 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import json import re diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py index 8ce4492f..b12db8ee 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from typing import Optional diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 9c469bc5..c063e052 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import json import re diff --git a/bdfr/site_downloaders/self_post.py b/bdfr/site_downloaders/self_post.py index 5719e59b..1b76b922 100644 --- a/bdfr/site_downloaders/self_post.py +++ b/bdfr/site_downloaders/self_post.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from typing import Optional diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py index aa1e9494..c63ff3fa 100644 --- a/bdfr/site_downloaders/vidble.py +++ b/bdfr/site_downloaders/vidble.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import itertools import logging diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py index 48f5ba1f..04cfed1d 100644 --- a/bdfr/site_downloaders/vreddit.py +++ b/bdfr/site_downloaders/vreddit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging from typing import Optional diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index 306d2e1d..71d7be00 100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import tempfile diff --git a/tests/__init__.py b/tests/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/tests/archive_entry/__init__.py b/tests/archive_entry/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/tests/archive_entry/__init__.py +++ b/tests/archive_entry/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/tests/archive_entry/test_comment_archive_entry.py b/tests/archive_entry/test_comment_archive_entry.py index 1895a894..a015138a 100644 --- a/tests/archive_entry/test_comment_archive_entry.py +++ b/tests/archive_entry/test_comment_archive_entry.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import praw import pytest diff --git a/tests/archive_entry/test_submission_archive_entry.py b/tests/archive_entry/test_submission_archive_entry.py index 8b83f1de..6cea3f8d 100644 --- a/tests/archive_entry/test_submission_archive_entry.py +++ 
b/tests/archive_entry/test_submission_archive_entry.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import praw import pytest diff --git a/tests/conftest.py b/tests/conftest.py index 77a26fb0..58565363 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import configparser import socket diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/tests/integration_tests/__init__.py +++ b/tests/integration_tests/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 42689a89..329b9a17 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import re import shutil diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index 60e40120..bc250e21 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import shutil from pathlib import Path diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 5d7238aa..f278d011 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import shutil from pathlib import Path diff --git a/tests/site_downloaders/__init__.py b/tests/site_downloaders/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/tests/site_downloaders/__init__.py +++ b/tests/site_downloaders/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/tests/site_downloaders/fallback_downloaders/__init__.py b/tests/site_downloaders/fallback_downloaders/__init__.py index 56fafa58..e5a0d9b4 100644 --- a/tests/site_downloaders/fallback_downloaders/__init__.py +++ b/tests/site_downloaders/fallback_downloaders/__init__.py @@ -1,2 +1 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index b7355390..9823d081 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock diff --git a/tests/site_downloaders/test_delay_for_reddit.py b/tests/site_downloaders/test_delay_for_reddit.py index 045c0225..c21784bd 100644 --- a/tests/site_downloaders/test_delay_for_reddit.py +++ b/tests/site_downloaders/test_delay_for_reddit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import Mock diff --git a/tests/site_downloaders/test_direct.py b/tests/site_downloaders/test_direct.py index 14190eef..e0215fa9 100644 --- a/tests/site_downloaders/test_direct.py +++ b/tests/site_downloaders/test_direct.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import Mock diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index f95e609b..1941df9f 
100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import praw import pytest diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index ce32e883..9de9a4cc 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import re from unittest.mock import MagicMock diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index c3cc86f7..cf0d3711 100644 --- a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import praw import pytest diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 0cfb36f4..c4731246 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import Mock diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 744488bb..67c162a1 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import Mock diff --git a/tests/site_downloaders/test_pornhub.py b/tests/site_downloaders/test_pornhub.py index d9971cb2..2ace05d5 100644 --- a/tests/site_downloaders/test_pornhub.py +++ b/tests/site_downloaders/test_pornhub.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 9d1a7f54..524e6ae6 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import re from unittest.mock import Mock diff --git a/tests/site_downloaders/test_self_post.py b/tests/site_downloaders/test_self_post.py index 9574b3cc..32addde6 100644 --- a/tests/site_downloaders/test_self_post.py +++ b/tests/site_downloaders/test_self_post.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import praw import pytest diff --git a/tests/site_downloaders/test_vidble.py b/tests/site_downloaders/test_vidble.py index 41398e77..29e5dd7c 100644 --- a/tests/site_downloaders/test_vidble.py +++ b/tests/site_downloaders/test_vidble.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import Mock diff --git a/tests/site_downloaders/test_vreddit.py b/tests/site_downloaders/test_vreddit.py index d5cc1213..246734c6 100644 --- a/tests/site_downloaders/test_vreddit.py +++ b/tests/site_downloaders/test_vreddit.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index 3100215d..bf832bb2 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock diff --git a/tests/test_archiver.py b/tests/test_archiver.py index cdd12d05..c8e90762 100644 --- a/tests/test_archiver.py +++ b/tests/test_archiver.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from pathlib import 
Path from unittest.mock import MagicMock diff --git a/tests/test_completion.py b/tests/test_completion.py index e29682a9..0bb57367 100644 --- a/tests/test_completion.py +++ b/tests/test_completion.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import sys from pathlib import Path diff --git a/tests/test_configuration.py b/tests/test_configuration.py index e7999b3b..b071a83a 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock diff --git a/tests/test_connector.py b/tests/test_connector.py index 9eabac76..832d2fe6 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from collections.abc import Iterator from datetime import datetime, timedelta diff --git a/tests/test_download_filter.py b/tests/test_download_filter.py index 6062dc3e..42e739f7 100644 --- a/tests/test_download_filter.py +++ b/tests/test_download_filter.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ebf82180..2b17eb02 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import logging import re from pathlib import Path diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index f4564156..b6c1aade 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import platform import sys diff --git a/tests/test_oauth2.py b/tests/test_oauth2.py index 123f7500..14b5cb0c 100644 --- a/tests/test_oauth2.py +++ b/tests/test_oauth2.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import configparser from pathlib import Path diff --git a/tests/test_resource.py b/tests/test_resource.py index e17d16a4..c672ff26 100644 --- a/tests/test_resource.py +++ b/tests/test_resource.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- from unittest.mock import MagicMock From 95c8c72271553c1b8eb280c473fbe3b03be07436 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 28 Jan 2023 23:58:36 -0500 Subject: [PATCH 02/82] UP008 --- bdfr/archive_entry/comment_archive_entry.py | 2 +- bdfr/archive_entry/submission_archive_entry.py | 2 +- bdfr/archiver.py | 4 ++-- bdfr/cloner.py | 2 +- bdfr/configuration.py | 2 +- bdfr/downloader.py | 2 +- bdfr/oauth2.py | 2 +- bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bdfr/archive_entry/comment_archive_entry.py b/bdfr/archive_entry/comment_archive_entry.py index 2c991b31..3ee53475 100644 --- a/bdfr/archive_entry/comment_archive_entry.py +++ b/bdfr/archive_entry/comment_archive_entry.py @@ -11,7 +11,7 @@ class CommentArchiveEntry(BaseArchiveEntry): def __init__(self, comment: praw.models.Comment): - super(CommentArchiveEntry, self).__init__(comment) + super().__init__(comment) def compile(self) -> dict: self.source.refresh() diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index e5560352..2a3fac5b 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -11,7 +11,7 @@ class SubmissionArchiveEntry(BaseArchiveEntry): def 
__init__(self, submission: praw.models.Submission): - super(SubmissionArchiveEntry, self).__init__(submission) + super().__init__(submission) def compile(self) -> dict: comments = self._get_comments() diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 60e3a88b..dd4b06eb 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -26,7 +26,7 @@ class Archiver(RedditConnector): def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()): - super(Archiver, self).__init__(args, logging_handlers) + super().__init__(args, logging_handlers) def download(self): for generator in self.reddit_lists: @@ -65,7 +65,7 @@ def get_submissions_from_link(self) -> list[list[praw.models.Submission]]: return [supplied_submissions] def get_user_data(self) -> list[Iterator]: - results = super(Archiver, self).get_user_data() + results = super().get_user_data() if self.args.user and self.args.all_comments: sort = self.determine_sort_function() for user in self.args.user: diff --git a/bdfr/cloner.py b/bdfr/cloner.py index 87fd9076..758e5c89 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -15,7 +15,7 @@ class RedditCloner(RedditDownloader, Archiver): def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()): - super(RedditCloner, self).__init__(args, logging_handlers) + super().__init__(args, logging_handlers) def download(self): for generator in self.reddit_lists: diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 32d02d07..4aba15f3 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -13,7 +13,7 @@ class Configuration(Namespace): def __init__(self): - super(Configuration, self).__init__() + super().__init__() self.authenticate = False self.config = None self.opts: Optional[str] = None diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 38b600b6..95143be7 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -37,7 +37,7 @@ def _calc_hash(existing_file: Path): class RedditDownloader(RedditConnector): def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()): - super(RedditDownloader, self).__init__(args, logging_handlers) + super().__init__(args, logging_handlers) if self.args.search_existing: self.master_hash_list = self.scan_existing_files(self.download_directory) diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 2da3c843..f7fa01e1 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -88,7 +88,7 @@ def send_message(client: socket.socket, message: str = ""): class OAuth2TokenManager(praw.reddit.BaseTokenManager): def __init__(self, config: configparser.ConfigParser, config_location: Path): - super(OAuth2TokenManager, self).__init__() + super().__init__() self.config = config self.config_location = config_location diff --git a/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py index 8c7fa9ff..86c2481d 100644 --- a/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py @@ -16,7 +16,7 @@ class YtdlpFallback(BaseFallbackDownloader, Youtube): def __init__(self, post: Submission): - super(YtdlpFallback, self).__init__(post) + super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: out = Resource( From 086f4090d4e4bd79811f5e776445379c0915df03 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 28 Jan 2023 23:59:53 -0500 Subject: [PATCH 
03/82] UP034 --- tests/integration_tests/test_download_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index f278d011..e6545054 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -217,7 +217,7 @@ def test_cli_download_download_filters(test_args: list[str], tmp_path: Path): test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert any((string in result.output for string in ("Download filter removed ", "filtered due to URL"))) + assert any(string in result.output for string in ("Download filter removed ", "filtered due to URL")) @pytest.mark.online From 730856934b0cd1ecae64441152eb34ce20308c01 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:18:20 -0500 Subject: [PATCH 04/82] Update unsaveposts.py Makes some updates to the unsaveposts script and updates the flake8 exclude now that there is a Python script in the scripts directory. Also adds the scripts directory to the actions test ignore, as any changes in there shouldn't have any effect on the tests that are performed. --- .github/workflows/test.yml | 2 ++ pyproject.toml | 2 +- scripts/unsaveposts.py | 31 +++++++++++++++++++------------ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 927d70ad..89e99612 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,12 +7,14 @@ on: - "**.md" - ".markdown_style.rb" - ".mdlrc" + - "scripts/" pull_request: branches: [ master, development ] paths-ignore: - "**.md" - ".markdown_style.rb" - ".mdlrc" + - "scripts/" jobs: test: diff --git a/pyproject.toml b/pyproject.toml index c88008d0..dc265b52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ bdfr-download = "bdfr.__main__:cli_download" line-length = 120 [tool.flake8] -exclude = ["scripts"] +exclude = ["scripts/tests"] max-line-length = 120 show-source = true statistics = true diff --git a/scripts/unsaveposts.py b/scripts/unsaveposts.py index c332520a..a0fe9efd 100644 --- a/scripts/unsaveposts.py +++ b/scripts/unsaveposts.py @@ -1,6 +1,6 @@ -#! /usr/bin/env python3.9 -''' -This script takes a list of submission IDs from a file named "successfulids" created with the +#!/usr/bin/env python3 +""" +This script takes a list of submission IDs from a file named "successfulids" created with the "extract_successful_ids.sh" script and unsaves them from your account. To make it work you must fill in the username and password fields below. Make sure you keep the quotes around the fields. You'll need to make a "user script" in your reddit profile to run this. Go to https://www.reddit.com/prefs/apps/ Click on "Create app" or "Create another app". Make sure you select a "script" not a "web app". Give it a random name. Fill in the "Redirect URI" with https://localhost:8080. The client ID is the 14 character string under the name you gave your script. It'll look like a bunch of random characters like this: pspYLwDoci9z_A The client secret is the longer string next to "secret". Replace those two fields below. Again keep the quotes around the fields.
-''' +""" -import praw +from pathlib import Path try: - r= praw.Reddit( + import praw + import prawcore.exceptions +except ImportError: + print("Please install PRAW") + +try: + reddit = praw.Reddit( client_id="CLIENTID", client_secret="CLIENTSECRET", password="USERPASSWORD", @@ -27,14 +33,15 @@ username="USERNAME", ) - with open("successfulids", "r") as f: - for item in f: - r.submission(id = item.strip()).unsave() + with Path("successfulids").open() as id_file: + for item in id_file: + reddit.submission(id=item.strip()).unsave() -except: - print("Something went wrong. Did you install PRAW? Did you change the user login fields?") +except FileNotFoundError: + print("ID file not found") +except prawcore.exceptions.ResponseException: + print("Something went wrong. Did you change the user login fields?") else: print("Done! Thanks for playing!") - From afd2f88f9165f9ea1076e0871ff6227a8d893bcd Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:36:23 -0500 Subject: [PATCH 05/82] Update test_direct.py If this changes again another link should probably be found as this is only a few days old. --- tests/site_downloaders/test_direct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/test_direct.py b/tests/site_downloaders/test_direct.py index c4279cdc..42e16237 100644 --- a/tests/site_downloaders/test_direct.py +++ b/tests/site_downloaders/test_direct.py @@ -16,7 +16,7 @@ ("https://i.redd.it/q6ebualjxzea1.jpg", "6ec154859c777cb401132bb991cb3635"), ( "https://file-examples.com/wp-content/uploads/2017/11/file_example_MP3_700KB.mp3", - "3caa342e241ddb7d76fd24a834094101", + "35257826e20227a8a57d0e5a410e03c7", ), ), ) From 07e38a77098bf4533e0f18788f4dcc0e68d42797 Mon Sep 17 00:00:00 2001 From: Bunny <73196784+bunny-foofoo@users.noreply.github.com> Date: Wed, 1 Feb 2023 19:26:16 -0800 Subject: [PATCH 06/82] fix #753 --- bdfr/file_name_formatter.py | 3 +++ tests/test_file_name_formatter.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index dd04fad9..cb779dcb 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -154,6 +154,7 @@ def limit_file_name_length(self, filename: str, ending: str, root: Path) -> Path max_path_length = max_path - len(ending) - len(str(root)) - 1 out = Path(root, filename + ending) + safe_ending = re.match(r".*\..*", ending) while any( [ len(filename) > max_file_part_length_chars, @@ -162,6 +163,8 @@ def limit_file_name_length(self, filename: str, ending: str, root: Path) -> Path ] ): filename = filename[:-1] + if not safe_ending and filename[-1] != ".": + filename = filename[:-1] + "." 
out = Path(root, filename + ending) return out diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index f4564156..32c44493 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -519,3 +519,19 @@ def test_name_submission( results = test_formatter.format_resource_paths(test_resources, Path()) results = set([r[0].name for r in results]) assert results == expected_names + + +@pytest.mark.parametrize( + ("test_filename", "test_ending", "expected_end"), + ( + ("A" * 300 + ".", "_1.mp4", "A_1.mp4"), + ("A" * 300 + ".", ".mp4", "A.mp4"), + ("A" * 300 + ".", "mp4", "A.mp4"), + ), +) +def test_shortened_file_name_ending( + test_filename: str, test_ending: str, expected_end: str, test_formatter: FileNameFormatter +): + result = test_formatter.limit_file_name_length(test_filename, test_ending, Path(".")) + assert result.name.endswith(expected_end) + assert len(str(result)) <= FileNameFormatter.find_max_path_length() From a535fee574e4fab55e1618987d263e43ccb89b00 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 2 Feb 2023 11:50:47 -0500 Subject: [PATCH 07/82] Black update Black version 23.1.0 updates --- bdfr/connector.py | 1 - bdfr/downloader.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 77a4a71a..3bb66861 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -65,7 +65,6 @@ def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handl self.reddit_lists = self.retrieve_reddit_lists() def _setup_internal_objects(self): - self.parse_disabled_modules() self.download_filter = self.create_download_filter() diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 20984e69..84cae379 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -156,7 +156,7 @@ def _download_submission(self, submission: praw.models.Submission): @staticmethod def scan_existing_files(directory: Path) -> dict[str, Path]: files = [] - for (dirpath, _dirnames, filenames) in os.walk(directory): + for dirpath, _dirnames, filenames in os.walk(directory): files.extend([Path(dirpath, file) for file in filenames]) logger.info(f"Calculating hashes for {len(files)} files") From 4a91ff6293eed5aeac65616134f0df4a5f8d0224 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 3 Feb 2023 23:39:25 -0500 Subject: [PATCH 08/82] Version logged Adds to end log message to include type of run and version of BDFR REF: https://github.com/aliparlakci/bulk-downloader-for-reddit/issues/764#issuecomment-1414552069_ --- bdfr/__main__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index dadba517..9ded38d7 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -117,10 +117,10 @@ def cli_download(context: click.Context, **_): reddit_downloader = RedditDownloader(config, [stream]) reddit_downloader.download() except Exception: - logger.exception("Downloader exited unexpectedly") + logger.exception(f"Downloader exited unexpectedly - BDFR Downloader v{__version__}") raise else: - logger.info("Program complete") + logger.info(f"Program complete - BDFR Downloader v{__version__}") @cli.command("archive") @@ -138,10 +138,10 @@ def cli_archive(context: click.Context, **_): reddit_archiver = Archiver(config, [stream]) reddit_archiver.download() except Exception: - logger.exception("Archiver exited unexpectedly") + logger.exception(f"Archiver exited 
unexpectedly - BDFR Archiver v{__version__}") raise else: - logger.info("Program complete") + logger.info(f"Program complete - BDFR Archiver v{__version__}") @cli.command("clone") @@ -160,10 +160,10 @@ def cli_clone(context: click.Context, **_): reddit_scraper = RedditCloner(config, [stream]) reddit_scraper.download() except Exception: - logger.exception("Scraper exited unexpectedly") + logger.exception(f"Scraper exited unexpectedly - BDFR Scraper v{__version__}") raise else: - logger.info("Program complete") + logger.info(f"Program complete - BDFR Cloner v{__version__}") @cli.command("completion") From 1895d2f22aba01524af4144a1eda6053fb16d86f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 6 Feb 2023 09:53:13 +1000 Subject: [PATCH 09/82] Add warning for --search-existing --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5a927bf1..0af7b162 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,7 @@ The following options apply only to the `download` command. This command downloa - `--search-existing` - This will make the BDFR compile the hashes for every file in `directory` - The hashes are used to remove duplicates if `--no-dupes` is supplied or make hard links if `--make-hard-links` is supplied + - **The use of this option is highly discouraged due to inefficiency** - `--file-scheme` - Sets the scheme for files - Default is `{REDDITOR}_{TITLE}_{POSTID}` From 4e15af637fed8485f58e7da958cb510ae1b0e810 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Mon, 6 Feb 2023 09:54:52 +1000 Subject: [PATCH 10/82] Make option descriptions clearer --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0af7b162..37eaff46 100644 --- a/README.md +++ b/README.md @@ -233,11 +233,11 @@ The following options apply only to the `download` command.
This command downloa - The default is 120 seconds - See [Rate Limiting](#rate-limiting) for details - `--no-dupes` - - This flag will not redownload files if they were already downloaded in the current run + - This flag will skip writing a file to disk if that file was already downloaded in the current run - This is calculated by MD5 hash - `--search-existing` - This will make the BDFR compile the hashes for every file in `directory` - - The hashes are used to remove duplicates if `--no-dupes` is supplied or make hard links if `--make-hard-links` is supplied + - The hashes are used to skip duplicate files if `--no-dupes` is supplied or make hard links if `--make-hard-links` is supplied - **The use of this option is highly discouraged due to inefficiency** - `--file-scheme` - Sets the scheme for files From 0bf44e5d827b74b1bc4c4de20fcf957aef374526 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sun, 29 Jan 2023 00:01:21 -0500 Subject: [PATCH 11/82] UP012 --- bdfr/oauth2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index f7fa01e1..09a15215 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -82,7 +82,7 @@ def receive_connection() -> socket.socket: @staticmethod def send_message(client: socket.socket, message: str = ""): - client.send(f"HTTP/1.1 200 OK\r\n\r\n{message}".encode("utf-8")) + client.send(f"HTTP/1.1 200 OK\r\n\r\n{message}".encode()) client.close() From 0e28c7ed7cf4c582fcca84c80149c7b55c3e0f50 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 4 Feb 2023 13:24:38 -0500 Subject: [PATCH 12/82] Gfycat API Moves Gfycat to use the API via a site access key. Adds cachetools as a dependency to reuse API keys for Gfycat/Redgifs at 95% of their TTL. Includes tests to verify caching. Updates versions of requests/yt-dlp/black/isort/pytest. Adds a default timeout to requests calls. Adds validate-pyproject and blacken-docs to pre-commit and updates hook versions.
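For illustration, a minimal sketch of the token-reuse pattern this commit introduces (names simplified; the Redgifs endpoint, TTLCache parameters, and timeout are taken from the diff below, while the 24-hour token lifetime implied by 82080 = 0.95 x 86400 seconds is an assumption):

    import json

    import requests
    from cachetools import TTLCache, cached

    @cached(cache=TTLCache(maxsize=5, ttl=82080))  # expire the cache at ~95% of the assumed 24h token lifetime
    def get_auth_token() -> str:
        # Repeated calls within the TTL return the cached token instead of hitting the API again
        response = requests.get("https://api.redgifs.com/v2/auth/temporary", timeout=10)
        return json.loads(response.text)["token"]
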
--- .pre-commit-config.yaml | 15 +++++++-- bdfr/__main__.py | 2 +- bdfr/downloader.py | 2 +- bdfr/oauth2.py | 4 ++- bdfr/resource.py | 4 +-- bdfr/site_downloaders/base_downloader.py | 13 ++++++- bdfr/site_downloaders/gallery.py | 2 +- bdfr/site_downloaders/gfycat.py | 43 ++++++++++++++++++++---- bdfr/site_downloaders/redgifs.py | 15 +++++---- bdfr/site_downloaders/vidble.py | 2 +- pyproject.toml | 13 +++---- tests/site_downloaders/test_direct.py | 5 +-- tests/site_downloaders/test_gfycat.py | 7 ++++ tests/site_downloaders/test_redgifs.py | 7 ++++ 14 files changed, 101 insertions(+), 33 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 28bd1400..0537e57a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,13 +2,18 @@ # See https://pre-commit.com/hooks.html for more hooks repos: + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.12.1 + hooks: + - id: validate-pyproject + - repo: https://github.com/psf/black - rev: 22.12.0 + rev: 23.1.0 hooks: - id: black - repo: https://github.com/pycqa/isort - rev: 5.11.4 + rev: 5.12.0 hooks: - id: isort name: isort (python) @@ -23,3 +28,9 @@ repos: rev: v0.12.0 hooks: - id: markdownlint + + - repo: https://github.com/adamchainz/blacken-docs + rev: 1.13.0 + hooks: + - id: blacken-docs + additional_dependencies: [black>=23.1.0] diff --git a/bdfr/__main__.py b/bdfr/__main__.py index dadba517..8ae2b5b2 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -82,7 +82,7 @@ def _check_version(context, param, value): if not value or context.resilient_parsing: return current = __version__ - latest = requests.get("https://pypi.org/pypi/bdfr/json").json()["info"]["version"] + latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"] print(f"You are currently using v{current} the latest is v{latest}") context.exit() diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 84cae379..7ed724cb 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -26,7 +26,7 @@ def _calc_hash(existing_file: Path): chunk_size = 1024 * 1024 - md5_hash = hashlib.md5() + md5_hash = hashlib.md5(usedforsecurity=False) with existing_file.open("rb") as file: chunk = file.read(chunk_size) while chunk: diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index ead05534..e9ca904e 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -26,7 +26,9 @@ def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str): @staticmethod def _check_scopes(wanted_scopes: set[str]): response = requests.get( - "https://www.reddit.com/api/v1/scopes.json", headers={"User-Agent": "fetch-scopes test"} + "https://www.reddit.com/api/v1/scopes.json", + headers={"User-Agent": "fetch-scopes test"}, + timeout=10, ) known_scopes = [scope for scope, data in response.json().items()] known_scopes.append("*") diff --git a/bdfr/resource.py b/bdfr/resource.py index 37fc5214..23e2da1a 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -49,7 +49,7 @@ def download(self, download_parameters: Optional[dict] = None): self.create_hash() def create_hash(self): - self.hash = hashlib.md5(self.content) + self.hash = hashlib.md5(self.content, usedforsecurity=False) def _determine_extension(self) -> Optional[str]: extension_pattern = re.compile(r".*(\..{3,5})$") @@ -68,7 +68,7 @@ def http_download(url: str, download_parameters: dict) -> Optional[bytes]: max_wait_time = 300 while True: try: - response = requests.get(url, headers=headers) + response = requests.get(url, headers=headers, timeout=10) if 
re.match(r"^2\d{2}", str(response.status_code)) and response.content: return response.content elif response.status_code in (408, 429): diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py index e4ac111c..dafa90b6 100644 --- a/bdfr/site_downloaders/base_downloader.py +++ b/bdfr/site_downloaders/base_downloader.py @@ -28,10 +28,21 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l @staticmethod def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: try: - res = requests.get(url, cookies=cookies, headers=headers) + res = requests.get(url, cookies=cookies, headers=headers, timeout=10) except requests.exceptions.RequestException as e: logger.exception(e) raise SiteDownloaderError(f"Failed to get page {url}") if res.status_code != 200: raise ResourceNotFound(f"Server responded with {res.status_code} to {url}") return res + + @staticmethod + def post_url(url: str, cookies: dict = None, headers: dict = None, payload: dict = None) -> requests.Response: + try: + res = requests.post(url, cookies=cookies, headers=headers, json=payload, timeout=10) + except requests.exceptions.RequestException as e: + logger.exception(e) + raise SiteDownloaderError(f"Failed to post to {url}") + if res.status_code != 200: + raise ResourceNotFound(f"Server responded with {res.status_code} to {url}") + return res diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index 6f004104..2e7002f1 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -42,7 +42,7 @@ def _get_links(id_dict: list[dict]) -> list[str]: possible_extensions = (".jpg", ".png", ".gif", ".gifv", ".jpeg") for extension in possible_extensions: test_url = f"https://i.redd.it/{image_id}{extension}" - response = requests.head(test_url) + response = requests.head(test_url, timeout=10) if response.status_code == 200: out.append(test_url) break diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index 45246894..57194cf3 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -5,7 +5,7 @@ import re from typing import Optional -from bs4 import BeautifulSoup +from cachetools import TTLCache, cached from praw.models import Submission from bdfr.exceptions import SiteDownloaderError @@ -21,6 +21,20 @@ def __init__(self, post: Submission): def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: return super().find_resources(authenticator) + @staticmethod + @cached(cache=TTLCache(maxsize=5, ttl=3420)) + def _get_auth_token() -> str: + headers = { + "content-type": "text/plain;charset=UTF-8", + "host": "weblogin.gfycat.com", + "origin": "https://gfycat.com", + } + payload = {"access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2M95Rfa5FLLhPFucu8H5HTzeutyAa"} + token = json.loads( + Gfycat.post_url("https://weblogin.gfycat.com/oauth/webtoken", headers=headers, payload=payload).text + )["access_token"] + return token + @staticmethod def _get_link(url: str) -> set[str]: gfycat_id = re.match(r".*/(.*?)(?:/?|-.*|\..{3-4})$", url).group(1) @@ -28,18 +42,33 @@ def _get_link(url: str) -> set[str]: response = Gfycat.retrieve_url(url) if re.search(r"(redgifs|gifdeliverynetwork)", response.url): - url = url.lower() # Fixes error with old gfycat/redgifs links + url = url.lower() return Redgifs._get_link(url) - soup = BeautifulSoup(response.text, "html.parser") - content = soup.find("script", attrs={"data-react-helmet": "true", 
"type": "application/ld+json"}) + auth_token = Gfycat._get_auth_token() + if not auth_token: + raise SiteDownloaderError("Unable to retrieve Gfycat API token") + + headers = { + "referer": "https://gfycat.com/", + "origin": "https://gfycat.com", + "content-type": "application/json", + "Authorization": f"Bearer {auth_token}", + } + content = Gfycat.retrieve_url(f"https://api.gfycat.com/v1/gfycats/{gfycat_id}", headers=headers) + + if content is None: + raise SiteDownloaderError("Could not read the API source") try: - out = json.loads(content.contents[0])["video"]["contentUrl"] + response_json = json.loads(content.text) + except json.JSONDecodeError as e: + raise SiteDownloaderError(f"Received data was not valid JSON: {e}") + + try: + out = response_json["gfyItem"]["mp4Url"] except (IndexError, KeyError, AttributeError) as e: raise SiteDownloaderError(f"Failed to download Gfycat link {url}: {e}") - except json.JSONDecodeError as e: - raise SiteDownloaderError(f"Did not receive valid JSON data: {e}") return { out, } diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 9c469bc5..29427163 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -6,6 +6,7 @@ from typing import Optional import requests +from cachetools import TTLCache, cached from praw.models import Submission from bdfr.exceptions import SiteDownloaderError @@ -22,6 +23,12 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l media_urls = self._get_link(self.post.url) return [Resource(self.post, m, Resource.retry_download(m), None) for m in media_urls] + @staticmethod + @cached(cache=TTLCache(maxsize=5, ttl=82080)) + def _get_auth_token() -> str: + token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"] + return token + @staticmethod def _get_id(url: str) -> str: try: @@ -38,7 +45,7 @@ def _get_id(url: str) -> str: def _get_link(url: str) -> set[str]: redgif_id = Redgifs._get_id(url) - auth_token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"] + auth_token = Redgifs._get_auth_token() if not auth_token: raise SiteDownloaderError("Unable to retrieve Redgifs API token") @@ -48,7 +55,6 @@ def _get_link(url: str) -> set[str]: "content-type": "application/json", "Authorization": f"Bearer {auth_token}", } - content = Redgifs.retrieve_url(f"https://api.redgifs.com/v2/gifs/{redgif_id}", headers=headers) if content is None: @@ -62,7 +68,7 @@ def _get_link(url: str) -> set[str]: out = set() try: if response_json["gif"]["type"] == 1: # type 1 is a video - if requests.get(response_json["gif"]["urls"]["hd"], headers=headers).ok: + if requests.head(response_json["gif"]["urls"]["hd"], headers=headers, timeout=10).ok: out.add(response_json["gif"]["urls"]["hd"]) else: out.add(response_json["gif"]["urls"]["sd"]) @@ -80,7 +86,4 @@ def _get_link(url: str) -> set[str]: except (KeyError, AttributeError): raise SiteDownloaderError("Failed to find JSON data in page") - # Update subdomain if old one is returned - out = {re.sub("thumbs2", "thumbs3", link) for link in out} - out = {re.sub("thumbs3", "thumbs4", link) for link in out} return out diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py index aa1e9494..9ded2019 100644 --- a/bdfr/site_downloaders/vidble.py +++ b/bdfr/site_downloaders/vidble.py @@ -37,7 +37,7 @@ def get_links(url: str) -> set[str]: if not re.search(r"vidble.com/(show/|album/|watch\?v)", url): url = re.sub(r"/(\w*?)$", 
r"/show/\1", url) - page = requests.get(url) + page = requests.get(url, timeout=10) soup = bs4.BeautifulSoup(page.text, "html.parser") content_div = soup.find("div", attrs={"id": "ContentPlaceHolder1_divContent"}) images = content_div.find_all("img") diff --git a/pyproject.toml b/pyproject.toml index dc265b52..690b58a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,12 +25,13 @@ classifiers = [ dependencies = [ "appdirs>=1.4.4", "beautifulsoup4>=4.10.0", + "cachetools>=5.3.0", "click>=8.0.0", "dict2xml>=1.7.0", "praw>=7.2.0", "pyyaml>=5.4.1", - "requests>=2.25.1", - "yt-dlp>=2022.11.11", + "requests>=2.28.2", + "yt-dlp>=2023.1.6", ] dynamic = ["version"] @@ -41,11 +42,11 @@ data-files = {"config" = ["bdfr/default_config.cfg",]} [project.optional-dependencies] dev = [ - "black>=22.12.0", + "black>=23.1.0", "Flake8-pyproject>=1.2.2", - "isort>=5.11.4", - "pre-commit>=2.20.0", - "pytest>=7.1.0", + "isort>=5.12.0", + "pre-commit>=3.0.4", + "pytest>=7.2.1", "tox>=3.27.1", ] diff --git a/tests/site_downloaders/test_direct.py b/tests/site_downloaders/test_direct.py index 42e16237..ada5ef1c 100644 --- a/tests/site_downloaders/test_direct.py +++ b/tests/site_downloaders/test_direct.py @@ -14,10 +14,7 @@ ("test_url", "expected_hash"), ( ("https://i.redd.it/q6ebualjxzea1.jpg", "6ec154859c777cb401132bb991cb3635"), - ( - "https://file-examples.com/wp-content/uploads/2017/11/file_example_MP3_700KB.mp3", - "35257826e20227a8a57d0e5a410e03c7", - ), + ("https://filesamples.com/samples/audio/mp3/sample3.mp3", "d30a2308f188cbb11d74cf20c357891c"), ), ) def test_download_resource(test_url: str, expected_hash: str): diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 2821a7e9..545d273a 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -9,6 +9,13 @@ from bdfr.site_downloaders.gfycat import Gfycat +@pytest.mark.online +def test_auth_cache(): + auth1 = Gfycat._get_auth_token() + auth2 = Gfycat._get_auth_token() + assert auth1 == auth2 + + @pytest.mark.online @pytest.mark.parametrize( ("test_url", "expected_url"), diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 9d1a7f54..55899992 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -10,6 +10,13 @@ from bdfr.site_downloaders.redgifs import Redgifs +@pytest.mark.online +def test_auth_cache(): + auth1 = Redgifs._get_auth_token() + auth2 = Redgifs._get_auth_token() + assert auth1 == auth2 + + @pytest.mark.parametrize( ("test_url", "expected"), ( From 55384cd0f0213567811198d61ca41ff79424fe8a Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sun, 29 Jan 2023 00:02:37 -0500 Subject: [PATCH 13/82] UP032 --- bdfr/download_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/download_filter.py b/bdfr/download_filter.py index 00be7f6d..5814669d 100644 --- a/bdfr/download_filter.py +++ b/bdfr/download_filter.py @@ -33,7 +33,7 @@ def _check_extension(self, resource_extension: str) -> bool: if not self.excluded_extensions: return True combined_extensions = "|".join(self.excluded_extensions) - pattern = re.compile(r".*({})$".format(combined_extensions)) + pattern = re.compile(rf".*({combined_extensions})$") if re.match(pattern, resource_extension): logger.log(9, f'Url "{resource_extension}" matched with "{pattern}"') return False @@ -44,7 +44,7 @@ def _check_domain(self, url: str) -> bool: if not 
self.excluded_domains: return True combined_domains = "|".join(self.excluded_domains) - pattern = re.compile(r"https?://.*({}).*".format(combined_domains)) + pattern = re.compile(rf"https?://.*({combined_domains}).*") if re.match(pattern, url): logger.log(9, f'Url "{url}" matched with "{pattern}"') return False From 183f592ad884675fb9b20b779ef58b029ebb322b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 14 Feb 2023 14:37:55 +1000 Subject: [PATCH 14/82] Update CONTRIBUTING bug report requirements --- docs/CONTRIBUTING.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 53fcb03a..841204d9 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -10,12 +10,16 @@ All communication on GitHub, Discord, email, or any other medium must conform to **Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists, try to contribute to the issue. +**If you are asking a question** about the functioning of the BDFR or the interface, please use the discussions page. Bug reports are not the right medium for asking and answering questions, and the discussions page makes it much easier to discuss, answer, and save questions and responses for others going forwards. + ### Bugs When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug. If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be asked in the thread of the bug. +Adding this information is **not optional**. If a bug report is opened without this information, it cannot be replicated by developers. The logs will be asked for once and if they are not supplied, the issue will be closed due to lack of information. + ### Feature requests In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has a very small chance to get accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers. From 673076ed2e42faae15292aef6db3f859ba1b1115 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Tue, 14 Feb 2023 14:41:16 +1000 Subject: [PATCH 15/82] Add requirement for template --- .github/ISSUE_TEMPLATE/bug_report.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index e05bb369..b4cae852 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,9 +7,10 @@ assignees: '' --- +- [ ] I have read the [Opening an issue](https://github.com/aliparlakci/bulk-downloader-for-reddit/blob/master/docs/CONTRIBUTING.md#opening-an-issue) - [ ] I am reporting a bug. 
- [ ] I am running the latest version of BDfR -- [ ] I have read the [Opening an issue](https://github.com/aliparlakci/bulk-downloader-for-reddit/blob/master/docs/CONTRIBUTING.md#opening-an-issue) +- [ ] I am not asking a question about the BDFR (please use Discussions for this) ## Description From 9eeff73a12f0130aa8da904330560bad2fd2c2fa Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 17 Feb 2023 14:50:40 -0500 Subject: [PATCH 16/82] Update yt-dlp Update yt-dlp to 2023.2.17 as it contains updates to vreddit downloading for better coverage. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 690b58a2..9a2b5249 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "praw>=7.2.0", "pyyaml>=5.4.1", "requests>=2.28.2", - "yt-dlp>=2023.1.6", + "yt-dlp>=2023.2.17", ] dynamic = ["version"] From a3b9e78f53ead05e1d574b99487c464fc5ffd215 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 17 Feb 2023 15:17:25 -0500 Subject: [PATCH 17/82] Update tests Seems the thumbs subdomain was changed for these cases. --- tests/site_downloaders/test_gfycat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index f51f68a2..220c8071 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -21,12 +21,15 @@ def test_auth_cache(): ( ("https://gfycat.com/definitivecaninecrayfish", "https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4"), ("https://gfycat.com/dazzlingsilkyiguana", "https://giant.gfycat.com/DazzlingSilkyIguana.mp4"), - ("https://gfycat.com/WearyComposedHairstreak", "https://thumbs4.redgifs.com/WearyComposedHairstreak.mp4"), + ("https://gfycat.com/WearyComposedHairstreak", "https://thumbs44.redgifs.com/WearyComposedHairstreak.mp4"), ( "https://thumbs.gfycat.com/ComposedWholeBullfrog-size_restricted.gif", - "https://thumbs4.redgifs.com/ComposedWholeBullfrog.mp4", + "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4", + ), + ( + "https://giant.gfycat.com/ComposedWholeBullfrog.mp4", + "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4", ), - ("https://giant.gfycat.com/ComposedWholeBullfrog.mp4", "https://thumbs4.redgifs.com/ComposedWholeBullfrog.mp4"), ), ) def test_get_link(test_url: str, expected_url: str): From 5c57de7c7d54d3919afcb7e22bef2ce253f305c0 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 15:58:05 -0500 Subject: [PATCH 18/82] B907 Cleanup/updates Clean up some double-quoted locations based on bugbear B907 and add the same formatting to some other locations where the emphasis may be helpful.
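For context, a minimal sketch of what the B907-style change amounts to (the `scheme` value below is illustrative, not taken from the patch): instead of hand-wrapping a value in quotes inside an f-string, the `!r` conversion renders the value's repr(), which quotes strings automatically and cannot drift out of sync with the surrounding quote style.

```python
scheme = "windows"

# Hand-quoted f-string (old style) versus the !r conversion (new style).
manual = f"Setting filename restriction scheme to '{scheme}'"
with_repr = f"Setting filename restriction scheme to {scheme!r}"

# For a plain string the output is identical; !r also stays meaningful
# for non-string values, where manual quoting would be misleading.
assert manual == with_repr
print(with_repr)  # Setting filename restriction scheme to 'windows'
```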
--- bdfr/__main__.py | 2 +- bdfr/archiver.py | 2 +- bdfr/configuration.py | 4 ++-- bdfr/connector.py | 6 +++--- bdfr/download_filter.py | 4 ++-- bdfr/file_name_formatter.py | 2 +- bdfr/oauth2.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 3a4dce3a..193015e7 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -182,7 +182,7 @@ def cli_completion(shell: str, uninstall: bool): Completion(shell).uninstall() return if shell not in ("all", "bash", "fish", "zsh"): - print(f"{shell} is not a valid option.") + print(f"{shell!r} is not a valid option.") print("Options: all, bash, fish, zsh") return if click.confirm(f"Would you like to install {shell} completions for BDFR"): diff --git a/bdfr/archiver.py b/bdfr/archiver.py index dd4b06eb..7118ba65 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -94,7 +94,7 @@ def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comme elif self.args.format == "yaml": self._write_entry_yaml(archive_entry) else: - raise ArchiverError(f"Unknown format {self.args.format} given") + raise ArchiverError(f"Unknown format {self.args.format!r} given") logger.info(f"Record for entry item {praw_item.id} written to disk") def _write_entry_json(self, entry: BaseArchiveEntry): diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 4aba15f3..b7e7a251 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -63,7 +63,7 @@ def process_click_arguments(self, context: click.Context): self.parse_yaml_options(context.params["opts"]) for arg_key in context.params.keys(): if not hasattr(self, arg_key): - logger.warning(f"Ignoring an unknown CLI argument: {arg_key}") + logger.warning(f"Ignoring an unknown CLI argument: {arg_key!r}") continue val = context.params[arg_key] if val is None or val == (): @@ -84,6 +84,6 @@ def parse_yaml_options(self, file_path: str): return for arg_key, val in opts.items(): if not hasattr(self, arg_key): - logger.warning(f"Ignoring an unknown YAML argument: {arg_key}") + logger.warning(f"Ignoring an unknown YAML argument: {arg_key!r}") continue setattr(self, arg_key, val) diff --git a/bdfr/connector.py b/bdfr/connector.py index 2a87c18f..f01c752b 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -115,7 +115,7 @@ def read_config(self): self.args.filename_restriction_scheme = self.cfg_parser.get( "DEFAULT", "filename_restriction_scheme", fallback=None ) - logger.debug(f"Setting filename restriction scheme to '{self.args.filename_restriction_scheme}'") + logger.debug(f"Setting filename restriction scheme to {self.args.filename_restriction_scheme!r}") # Update config on disk with Path(self.config_location).open(mode="w") as file: self.cfg_parser.write(file) @@ -239,7 +239,7 @@ def sanitise_subreddit_name(subreddit: str) -> str: pattern = re.compile(r"^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$") match = re.match(pattern, subreddit) if not match: - raise errors.BulkDownloaderException(f"Could not find subreddit name in string {subreddit}") + raise errors.BulkDownloaderException(f"Could not find subreddit name in string {subreddit!r}") return match.group(1) @staticmethod @@ -285,7 +285,7 @@ def get_subreddits(self) -> list[praw.models.ListingGenerator]: ) ) logger.debug( - f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"' + f"Added submissions from subreddit {reddit} with the search term {self.args.search!r}" ) else: out.append(self.create_filtered_listing_generator(reddit)) diff --git 
a/bdfr/download_filter.py b/bdfr/download_filter.py index 5814669d..518c6663 100644 --- a/bdfr/download_filter.py +++ b/bdfr/download_filter.py @@ -35,7 +35,7 @@ def _check_extension(self, resource_extension: str) -> bool: combined_extensions = "|".join(self.excluded_extensions) pattern = re.compile(rf".*({combined_extensions})$") if re.match(pattern, resource_extension): - logger.log(9, f'Url "{resource_extension}" matched with "{pattern}"') + logger.log(9, f"Url {resource_extension!r} matched with {pattern!r}") return False else: return True @@ -46,7 +46,7 @@ def _check_domain(self, url: str) -> bool: combined_domains = "|".join(self.excluded_domains) pattern = re.compile(rf"https?://.*({combined_domains}).*") if re.match(pattern, url): - logger.log(9, f'Url "{url}" matched with "{pattern}"') + logger.log(9, f"Url {url!r} matched with {pattern!r}") return False else: return True diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 1eac6d1a..942d5b54 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -37,7 +37,7 @@ def __init__( restriction_scheme: Optional[str] = None, ): if not self.validate_string(file_format_string): - raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string') + raise BulkDownloaderException(f"{file_format_string!r} is not a valid format string") self.file_format_string = file_format_string self.directory_format_string: list[str] = directory_format_string.split("/") self.time_format_string = time_format_string diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 1051e5cb..3cd4951f 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -33,7 +33,7 @@ def _check_scopes(wanted_scopes: set[str]): known_scopes.append("*") for scope in wanted_scopes: if scope not in known_scopes: - raise BulkDownloaderException(f"Scope {scope} is not known to reddit") + raise BulkDownloaderException(f"Scope {scope!r} is not known to reddit") @staticmethod def split_scopes(scopes: str) -> set[str]: From 98aa3d7cb65cc79b4a875560303e39737306255b Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 16:06:32 -0500 Subject: [PATCH 19/82] Quote cleanup cleanup of some strings to prioritize outer double quotes when both are present or switch to double quotes when only single is present. 
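As an aside on the change itself, both quoting styles build the exact same string; the cleanup only standardises which quote character sits on the outside of the f-string. A small sketch with made-up values:

```python
modules = ["Direct", "Youtube"]

# Single quotes outside, double quotes inside (old style)...
old_style = f'Disabling the following modules: {", ".join(modules)}'
# ...versus double quotes outside, single quotes inside (new style).
new_style = f"Disabling the following modules: {', '.join(modules)}"

assert old_style == new_style
```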
--- README.md | 10 +++++----- bdfr/connector.py | 4 ++-- bdfr/downloader.py | 2 +- bdfr/oauth2.py | 4 ++-- bdfr/site_downloaders/download_factory.py | 2 +- bdfr/site_downloaders/redgifs.py | 6 ++---- pyproject.toml | 2 +- scripts/extract_failed_ids.sh | 12 ++++++------ scripts/extract_successful_ids.sh | 14 +++++++------- scripts/print_summary.sh | 16 ++++++++-------- .../test_download_integration.py | 2 +- 11 files changed, 36 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 37eaff46..e7ed4921 100644 --- a/README.md +++ b/README.md @@ -80,11 +80,11 @@ bdfr download ./path/to/output --user reddituser --submitted -L 100 ``` ```bash -bdfr download ./path/to/output --user me --saved --authenticate -L 25 --file-scheme '{POSTID}' +bdfr download ./path/to/output --user me --saved --authenticate -L 25 --file-scheme "{POSTID}" ``` ```bash -bdfr download ./path/to/output --subreddit 'Python, all, mindustry' -L 10 --make-hard-links +bdfr download ./path/to/output --subreddit "Python, all, mindustry" -L 10 --make-hard-links ``` ```bash @@ -92,7 +92,7 @@ bdfr archive ./path/to/output --user reddituser --submitted --all-comments --com ``` ```bash -bdfr archive ./path/to/output --subreddit all --format yaml -L 500 --folder-scheme '' +bdfr archive ./path/to/output --subreddit all --format yaml -L 500 --folder-scheme "" ``` Alternatively, you can pass options through a YAML file. @@ -191,13 +191,13 @@ The following options are common between both the `archive` and `download` comma - This is the name of a multireddit to add as a source - Can be specified multiple times - This can be done by using `-m` multiple times - - Multireddits can also be used to provide CSV multireddits e.g. `-m 'chess, favourites'` + - Multireddits can also be used to provide CSV multireddits e.g. `-m "chess, favourites"` - The specified multireddits must all belong to the user specified with the `--user` option - `-s, --subreddit` - This adds a subreddit as a source - Can be used mutliple times - This can be done by using `-s` multiple times - - Subreddits can also be used to provide CSV subreddits e.g. `-m 'all, python, mindustry'` + - Subreddits can also be used to provide CSV subreddits e.g. 
`-m "all, python, mindustry"` - `-t, --time` - This is the time filter that will be applied to all applicable sources - This option does not apply to upvoted or saved posts when scraping from these sources diff --git a/bdfr/connector.py b/bdfr/connector.py index f01c752b..a6ba23fa 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -125,7 +125,7 @@ def parse_disabled_modules(self): disabled_modules = self.split_args_input(disabled_modules) disabled_modules = {name.strip().lower() for name in disabled_modules} self.args.disable_module = disabled_modules - logger.debug(f'Disabling the following modules: {", ".join(self.args.disable_module)}') + logger.debug(f"Disabling the following modules: {', '.join(self.args.disable_module)}") def create_reddit_instance(self): if self.args.authenticate: @@ -301,7 +301,7 @@ def resolve_user_name(self, in_name: str) -> str: logger.log(9, f"Resolved user to {resolved_name}") return resolved_name else: - logger.warning('To use "me" as a user, an authenticated Reddit instance must be used') + logger.warning("To use 'me' as a user, an authenticated Reddit instance must be used") else: return in_name diff --git a/bdfr/downloader.py b/bdfr/downloader.py index 70068d23..b77c72a7 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -66,7 +66,7 @@ def _download_submission(self, submission: praw.models.Submission): ): logger.debug( f"Submission {submission.id} in {submission.subreddit.display_name} skipped" - f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user' + f" due to {submission.author.name if submission.author else 'DELETED'} being an ignored user" ) return elif self.args.min_score and submission.score < self.args.min_score: diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 3cd4951f..00074d75 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -59,10 +59,10 @@ def retrieve_new_token(self) -> str: if state != params["state"]: self.send_message(client) - raise RedditAuthenticationError(f'State mismatch in OAuth2. Expected: {state} Received: {params["state"]}') + raise RedditAuthenticationError(f"State mismatch in OAuth2. 
Expected: {state} Received: {params['state']}") elif "error" in params: self.send_message(client) - raise RedditAuthenticationError(f'Error in OAuth2: {params["error"]}') + raise RedditAuthenticationError(f"Error in OAuth2: {params['error']}") self.send_message(client, "") refresh_token = reddit.auth.authorize(params["code"]) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 921a1d19..d4fd83a7 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -82,7 +82,7 @@ def is_web_resource(url: str) -> bool: "php3", "xhtml", ) - if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url): + if re.match(rf"(?i).*/.*\.({'|'.join(web_extensions)})$", url): return True else: return False diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 92e73f31..8feab2ef 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -72,10 +72,8 @@ def _get_link(url: str) -> set[str]: else: out.add(response_json["gif"]["urls"]["sd"]) elif response_json["gif"]["type"] == 2: # type 2 is an image - if response_json["gif"]["gallery"]: - content = Redgifs.retrieve_url( - f'https://api.redgifs.com/v2/gallery/{response_json["gif"]["gallery"]}' - ) + if gallery := response_json["gif"]["gallery"]: + content = Redgifs.retrieve_url(f"https://api.redgifs.com/v2/gallery/{gallery}") response_json = json.loads(content.text) out = {p["urls"]["hd"] for p in response_json["gifs"]} else: diff --git a/pyproject.toml b/pyproject.toml index 9a2b5249..d0916997 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ dynamic = ["version"] [tool.setuptools] -dynamic = {"version" = {attr = 'bdfr.__version__'}} +dynamic = {"version" = {attr = "bdfr.__version__"}} packages = ["bdfr", "bdfr.archive_entry", "bdfr.site_downloaders", "bdfr.site_downloaders.fallback_downloaders",] data-files = {"config" = ["bdfr/default_config.cfg",]} diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 64d1e721..0543ef0c 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -3,14 +3,14 @@ if [ -e "$1" ]; then file="$1" else - echo 'CANNOT FIND LOG FILE' + echo "CANNOT FIND LOG FILE" exit 1 fi { - grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; - grep 'Failed to download resource' "$file" | awk '{ print $15 }' ; - grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; - grep 'Failed to write file' "$file" | awk '{ print $14 }' ; - grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ; + grep "Could not download submission" "$file" | awk "{ print $12 }" | rev | cut -c 2- | rev ; + grep "Failed to download resource" "$file" | awk "{ print $15 }" ; + grep "failed to download submission" "$file" | awk "{ print $14 }" | rev | cut -c 2- | rev ; + grep "Failed to write file" "$file" | awk "{ print $14 }" ; + grep "skipped due to disabled module" "$file" | awk "{ print $9 }" ; } diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index f2128e5a..159fd826 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -3,15 +3,15 @@ if [ -e "$1" ]; then file="$1" else - echo 'CANNOT FIND LOG FILE' + echo "CANNOT FIND LOG FILE" exit 1 fi { - grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' ; - grep 'Resource hash' "$file" | awk '{ print $(NF-2) }' ; - grep 
'Download filter' "$file" | awk '{ print $(NF-3) }' ; - grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ; - grep 'Hard link made' "$file" | awk '{ print $(NF) }' ; - grep 'filtered due to score' "$file" | awk '{ print $9 }' + grep "Downloaded submission" "$file" | awk "{ print $(NF-2) }" ; + grep "Resource hash" "$file" | awk "{ print $(NF-2) }" ; + grep "Download filter" "$file" | awk "{ print $(NF-3) }" ; + grep "already exists, continuing" "$file" | awk "{ print $(NF-3) }" ; + grep "Hard link made" "$file" | awk "{ print $(NF) }" ; + grep "filtered due to score" "$file" | awk "{ print $9 }" } diff --git a/scripts/print_summary.sh b/scripts/print_summary.sh index 052ef1e6..ec547b7f 100755 --- a/scripts/print_summary.sh +++ b/scripts/print_summary.sh @@ -3,14 +3,14 @@ if [ -e "$1" ]; then file="$1" else - echo 'CANNOT FIND LOG FILE' + echo "CANNOT FIND LOG FILE" exit 1 fi -echo "Downloaded submissions: $( grep -c 'Downloaded submission' "$file" )" -echo "Failed downloads: $( grep -c 'failed to download submission' "$file" )" -echo "Files already downloaded: $( grep -c 'already exists, continuing' "$file" )" -echo "Hard linked submissions: $( grep -c 'Hard link made' "$file" )" -echo "Excluded submissions: $( grep -c 'in exclusion list' "$file" )" -echo "Files with existing hash skipped: $( grep -c 'downloaded elsewhere' "$file" )" -echo "Submissions from excluded subreddits: $( grep -c 'in skip list' "$file" )" +echo "Downloaded submissions: $( grep -c 'Downloaded submission' '$file' )" +echo "Failed downloads: $( grep -c 'failed to download submission' '$file' )" +echo "Files already downloaded: $( grep -c 'already exists, continuing' '$file' )" +echo "Hard linked submissions: $( grep -c 'Hard link made' '$file' )" +echo "Excluded submissions: $( grep -c 'in exclusion list' '$file' )" +echo "Files with existing hash skipped: $( grep -c 'downloaded elsewhere' '$file' )" +echo "Submissions from excluded subreddits: $( grep -c 'in skip list' '$file' )" diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index e6545054..a2b29d9d 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -185,7 +185,7 @@ def test_cli_download_user_data_bad_me_unauthenticated(test_args: list[str], tmp test_args = create_basic_args_for_download_runner(test_args, tmp_path) result = runner.invoke(cli, test_args) assert result.exit_code == 0 - assert 'To use "me" as a user, an authenticated Reddit instance must be used' in result.output + assert "To use 'me' as a user, an authenticated Reddit instance must be used" in result.output @pytest.mark.online From 8f9bed0874008d7c572f0d6d1a90e16b7d5dc69c Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 17:36:00 -0500 Subject: [PATCH 20/82] Harden requests Moves bare requests out of site downloaders into the base_downloader and add timeout exception handling now that there is a default timeout. 
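In outline, the hardening pattern the diffs below apply looks roughly like this (a sketch only, with a generic RuntimeError standing in for the project's own exception types):

```python
import requests


def retrieve_url(url: str) -> requests.Response:
    # An explicit timeout turns a stalled connection into an exception
    # that can be reported instead of hanging the whole run.
    try:
        res = requests.get(url, timeout=10)
    except requests.exceptions.Timeout as e:
        raise RuntimeError(f"Timeout reached attempting to get page {url}") from e
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Failed to get page {url}") from e
    if res.status_code != 200:
        raise RuntimeError(f"Server responded with {res.status_code} at {url}")
    return res
```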
--- bdfr/__main__.py | 10 +++++++--- bdfr/oauth2.py | 13 ++++++++----- bdfr/site_downloaders/base_downloader.py | 20 +++++++++++++++++++- bdfr/site_downloaders/gallery.py | 4 +--- bdfr/site_downloaders/redgifs.py | 3 +-- bdfr/site_downloaders/vidble.py | 3 +-- 6 files changed, 37 insertions(+), 16 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 3a4dce3a..35cf643b 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -77,12 +77,16 @@ def wrap(func): return wrap -def _check_version(context, param, value): +def _check_version(context, _param, value): if not value or context.resilient_parsing: return current = __version__ - latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"] - print(f"You are currently using v{current} the latest is v{latest}") + try: + latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"] + print(f"You are currently using v{current} the latest is v{latest}") + except TimeoutError: + logger.exception(f"Timeout reached fetching current version from Pypi - BDFR v{current}") + raise context.exit() diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 1051e5cb..5a01e638 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -24,11 +24,14 @@ def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str): @staticmethod def _check_scopes(wanted_scopes: set[str]): - response = requests.get( - "https://www.reddit.com/api/v1/scopes.json", - headers={"User-Agent": "fetch-scopes test"}, - timeout=10, - ) + try: + response = requests.get( + "https://www.reddit.com/api/v1/scopes.json", + headers={"User-Agent": "fetch-scopes test"}, + timeout=10, + ) + except TimeoutError: + raise BulkDownloaderException("Reached timeout fetching scopes") known_scopes = [scope for scope, data in response.json().items()] known_scopes.append("*") for scope in wanted_scopes: diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py index f2f573ea..8b4f892d 100644 --- a/bdfr/site_downloaders/base_downloader.py +++ b/bdfr/site_downloaders/base_downloader.py @@ -31,8 +31,11 @@ def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> reques except requests.exceptions.RequestException as e: logger.exception(e) raise SiteDownloaderError(f"Failed to get page {url}") + except TimeoutError as e: + logger.exception(e) + raise SiteDownloaderError(f"Timeout reached attempting to get page {url}") if res.status_code != 200: - raise ResourceNotFound(f"Server responded with {res.status_code} to {url}") + raise ResourceNotFound(f"Server responded with {res.status_code} at {url}") return res @staticmethod @@ -42,6 +45,21 @@ def post_url(url: str, cookies: dict = None, headers: dict = None, payload: dict except requests.exceptions.RequestException as e: logger.exception(e) raise SiteDownloaderError(f"Failed to post to {url}") + except TimeoutError as e: + logger.exception(e) + raise SiteDownloaderError(f"Timeout reached attempting to post to page {url}") if res.status_code != 200: raise ResourceNotFound(f"Server responded with {res.status_code} to {url}") return res + + @staticmethod + def head_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: + try: + res = requests.head(url, cookies=cookies, headers=headers, timeout=10) + except requests.exceptions.RequestException as e: + logger.exception(e) + raise SiteDownloaderError(f"Failed to check head at {url}") + except TimeoutError as e: + logger.exception(e) + raise 
SiteDownloaderError(f"Timeout reached attempting to check head at {url}") + return res diff --git a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index dab357cb..a2f65f4f 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -3,7 +3,6 @@ import logging from typing import Optional -import requests from praw.models import Submission from bdfr.exceptions import SiteDownloaderError @@ -41,8 +40,7 @@ def _get_links(id_dict: list[dict]) -> list[str]: possible_extensions = (".jpg", ".png", ".gif", ".gifv", ".jpeg") for extension in possible_extensions: test_url = f"https://i.redd.it/{image_id}{extension}" - response = requests.head(test_url, timeout=10) - if response.status_code == 200: + if Gallery.head_url(test_url).status_code == 200: out.append(test_url) break return out diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index 92e73f31..bdc6ac36 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -4,7 +4,6 @@ import re from typing import Optional -import requests from cachetools import TTLCache, cached from praw.models import Submission @@ -67,7 +66,7 @@ def _get_link(url: str) -> set[str]: out = set() try: if response_json["gif"]["type"] == 1: # type 1 is a video - if requests.head(response_json["gif"]["urls"]["hd"], headers=headers, timeout=10).ok: + if Redgifs.head_url(response_json["gif"]["urls"]["hd"], headers=headers).status_code == 200: out.add(response_json["gif"]["urls"]["hd"]) else: out.add(response_json["gif"]["urls"]["sd"]) diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py index 6b82ab55..9fdcecd6 100644 --- a/bdfr/site_downloaders/vidble.py +++ b/bdfr/site_downloaders/vidble.py @@ -6,7 +6,6 @@ from typing import Optional import bs4 -import requests from praw.models import Submission from bdfr.exceptions import SiteDownloaderError @@ -36,7 +35,7 @@ def get_links(url: str) -> set[str]: if not re.search(r"vidble.com/(show/|album/|watch\?v)", url): url = re.sub(r"/(\w*?)$", r"/show/\1", url) - page = requests.get(url, timeout=10) + page = Vidble.retrieve_url(url) soup = bs4.BeautifulSoup(page.text, "html.parser") content_div = soup.find("div", attrs={"id": "ContentPlaceHolder1_divContent"}) images = content_div.find_all("img") From 80c66c8c78939aba2edc1b969ea5c3042975812c Mon Sep 17 00:00:00 2001 From: Soulsuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sat, 18 Feb 2023 23:59:32 -0500 Subject: [PATCH 21/82] Erome fixes Fixes searched class to not include thumbnails of other albums. 
--- bdfr/site_downloaders/erome.py | 2 +- tests/site_downloaders/test_erome.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index 69799dbc..d1411602 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -37,7 +37,7 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l def _get_links(url: str) -> set[str]: page = Erome.retrieve_url(url) soup = bs4.BeautifulSoup(page.text, "html.parser") - front_images = soup.find_all("img", attrs={"class": "lasyload"}) + front_images = soup.find_all("img", attrs={"class": "img-front"}) out = [im.get("data-src") for im in front_images] videos = soup.find_all("source") diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 9de9a4cc..7dd31813 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -39,6 +39,7 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): ( ("https://www.erome.com/a/vqtPuLXh", 1), ("https://www.erome.com/a/4tP3KI6F", 1), + ("https://www.erome.com/a/WNyK674a", 41), ), ) def test_download_resource(test_url: str, expected_hashes_len: int): From 987172c1ceafd69944735f53f6fbd6008432eec3 Mon Sep 17 00:00:00 2001 From: Thomas <71355143+thomas694@users.noreply.github.com> Date: Thu, 23 Feb 2023 00:32:02 +0100 Subject: [PATCH 22/82] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 37eaff46..fef3b252 100644 --- a/README.md +++ b/README.md @@ -408,7 +408,7 @@ Modules can be disabled through the command line interface for the BDFR or more - `Vidble` - `VReddit` (Reddit Video Post) - `Youtube` -- `YoutubeDlFallback` +- `YtdlpFallback` (Youtube DL Fallback) ### Rate Limiting From 47fb5d38ada2c1bd90d25b8904efb70cdc42726c Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 24 Feb 2023 14:48:33 -0500 Subject: [PATCH 23/82] Bandit fixes Fixes S101 in connector Noqa S104 in oauth2 --- bdfr/connector.py | 3 ++- bdfr/oauth2.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index a6ba23fa..0bcfbb57 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -433,7 +433,8 @@ def check_subreddit_status(subreddit: praw.models.Subreddit): if subreddit.display_name in ("all", "friends"): return try: - assert subreddit.id + if subreddit.id: + return except prawcore.NotFound: raise errors.BulkDownloaderException(f"Source {subreddit.display_name} cannot be found") except prawcore.Redirect: diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index be96f2c0..c9ee56af 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -75,7 +75,7 @@ def retrieve_new_token(self) -> str: def receive_connection() -> socket.socket: server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(("0.0.0.0", 7634)) + server.bind(("0.0.0.0", 7634)) # noqa: S104 logger.log(9, "Server listening on 0.0.0.0:7634") server.listen(1) From 005454a5c2e0cf482ca5101f4eccbc9a86e923e1 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 24 Feb 2023 21:44:16 -0500 Subject: [PATCH 24/82] Don't create directory if not needed Moves creation of parent directories after dupe check so directories are not made if not needed. 
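Paraphrased with simplified names (a sketch, not the project's actual method), the fix is purely a reordering: the duplicate check now returns before mkdir, so a skipped submission no longer leaves an empty directory behind.

```python
from pathlib import Path


def save(destination: Path, content: bytes, master_hash_list: dict[str, Path], digest: str) -> None:
    if digest in master_hash_list:
        # Bail out before mkdir: a duplicate must not create its
        # (otherwise empty) destination directory tree.
        return
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(content)
    master_hash_list[digest] = destination
```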
--- bdfr/downloader.py | 3 ++- .../integration_tests/test_download_integration.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bdfr/downloader.py b/bdfr/downloader.py index b77c72a7..bd242b3c 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -123,12 +123,12 @@ def _download_submission(self, submission: praw.models.Submission): ) return resource_hash = res.hash.hexdigest() - destination.parent.mkdir(parents=True, exist_ok=True) if resource_hash in self.master_hash_list: if self.args.no_dupes: logger.info(f"Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere") return elif self.args.make_hard_links: + destination.parent.mkdir(parents=True, exist_ok=True) try: destination.hardlink_to(self.master_hash_list[resource_hash]) except AttributeError: @@ -138,6 +138,7 @@ def _download_submission(self, submission: praw.models.Submission): f" in submission {submission.id}" ) return + destination.parent.mkdir(parents=True, exist_ok=True) try: with destination.open("wb") as file: file.write(res.content) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index a2b29d9d..711f9c08 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -439,3 +439,17 @@ def test_cli_download_explicit_filename_restriction_scheme(test_args: list[str], assert result.exit_code == 0 assert "Downloaded submission" in result.output assert "Forcing Windows-compatible filenames" in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize("test_args", (["--link", "ehqt2g", "--link", "ehtuv8", "--no-dupes"],)) +def test_cli_download_no_empty_dirs(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_download_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert "downloaded elsewhere" in result.output + assert Path(tmp_path, "EmpireDidNothingWrong").exists() + assert not Path(tmp_path, "StarWarsEU").exists() From dd283130e36632df943c6b1d7e3b9b2b8ee67f84 Mon Sep 17 00:00:00 2001 From: Soulsuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Mon, 27 Feb 2023 15:53:47 -0500 Subject: [PATCH 25/82] Imgur fixes Update regex for links styled as direct and album --- bdfr/site_downloaders/imgur.py | 2 +- tests/site_downloaders/test_imgur.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index 964fa621..cf15c3c0 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -40,7 +40,7 @@ def _get_data(link: str) -> dict: if link.endswith("/"): link = link.removesuffix("/") if re.search(r".*/(.*?)(gallery/|a/)", link): - imgur_id = re.match(r".*/(?:gallery/|a/)(.*?)(?:/.*)?$", link).group(1) + imgur_id = re.match(r".*/(?:gallery/|a/)(.*?)(?:/.*|\..{3,4})?$", link).group(1) link = f"https://api.imgur.com/3/album/{imgur_id}" else: imgur_id = re.match(r".*/(.*?)(?:_d)?(?:\..{0,})?$", link).group(1) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 50ac4c96..8881c319 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -49,6 +49,7 @@ ("https://imgur.com/a/1qzfWtY/mp4", 
("65fbc7ba5c3ed0e3af47c4feef4d3735",)), ("https://imgur.com/a/1qzfWtY/spqr", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), ("https://i.imgur.com/expO7Rc.gifv", ("e309f98158fc98072eb2ae68f947f421",)), + ("https://i.imgur.com/a/aqpiMuL.gif", ("5b2a9a5218bf43dc26ba41389410c981",)), ), ) def test_find_resources(test_url: str, expected_hashes: list[str]): From 74d842e6da6eb56f4812d9b03a6f7d33ff93748b Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Wed, 1 Mar 2023 23:13:09 -0500 Subject: [PATCH 26/82] Scripts testing/fixes Adds Bats and Pester testing to for bash and powershell scripts Updates powershell scripts to match bash scripts in logic Added missing score filter lookup for powershell script --- .github/workflows/scripts-test.yml | 29 ++++++++++++ scripts/extract_failed_ids.ps1 | 26 ++++------- scripts/extract_failed_ids.sh | 2 +- scripts/extract_successful_ids.ps1 | 27 +++++------ scripts/extract_successful_ids.sh | 2 +- scripts/print_summary.ps1 | 14 ++---- scripts/print_summary.sh | 2 +- .../example_logfiles/succeed_score_filter.txt | 2 +- scripts/tests/extract_failed_ids.Tests.ps1 | 39 ++++++++++++++++ .../tests/extract_successful_ids.Tests.ps1 | 45 +++++++++++++++++++ scripts/tests/test_extract_failed_ids.bats | 7 ++- .../tests/test_extract_successful_ids.bats | 10 +++++ 12 files changed, 155 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/scripts-test.yml create mode 100644 scripts/tests/extract_failed_ids.Tests.ps1 create mode 100644 scripts/tests/extract_successful_ids.Tests.ps1 diff --git a/.github/workflows/scripts-test.yml b/.github/workflows/scripts-test.yml new file mode 100644 index 00000000..a9c35b45 --- /dev/null +++ b/.github/workflows/scripts-test.yml @@ -0,0 +1,29 @@ +name: Scripts Test + +on: + push: + paths: + - "scripts/*.sh" + - "scripts/*.ps1" + pull_request: + paths: + - "scripts/*.sh" + - "scripts/*.ps1" + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: 'true' + - name: Bats tests + run: | + cd scripts/tests/ + bats/bin/bats *.bats + + - name: Pester tests + shell: pwsh + run: | + cd scripts/tests/ + Invoke-Pester -CI -PassThru . 
diff --git a/scripts/extract_failed_ids.ps1 b/scripts/extract_failed_ids.ps1 index 4820d04b..0b2c1617 100644 --- a/scripts/extract_failed_ids.ps1 +++ b/scripts/extract_failed_ids.ps1 @@ -1,21 +1,13 @@ -if (Test-Path -Path $args[0] -PathType Leaf) { - $file=$args[0] -} -else { - Write-Host "CANNOT FIND LOG FILE" +if (($args[0] -eq $null) -or -Not (Test-Path -Path $args[0] -PathType Leaf)) { + Write-Output "CANNOT FIND LOG FILE" Exit 1 } - -if ($null -ne $args[1]) { - $output=$args[1] - Write-Host "Outputting IDs to $output" -} -else { - $output="./failed.txt" +elseif (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] } -Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } >> $output -Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output -Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } >> $output -Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } >> $output -Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } >> $output +Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } +Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } +Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } +Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } +Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index 0543ef0c..d7629006 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ -e "$1" ]; then +if [ -e "$1" ] && [ -f "$1" ]; then file="$1" else echo "CANNOT FIND LOG FILE" diff --git a/scripts/extract_successful_ids.ps1 b/scripts/extract_successful_ids.ps1 index 70c463ba..fd00719d 100644 --- a/scripts/extract_successful_ids.ps1 +++ b/scripts/extract_successful_ids.ps1 @@ -1,21 +1,14 @@ -if (Test-Path -Path $args[0] -PathType Leaf) { - $file=$args[0] -} -else { - Write-Host "CANNOT FIND LOG FILE" +if (($args[0] -eq $null) -or -Not (Test-Path -Path $args[0] -PathType Leaf)) { + Write-Output "CANNOT FIND LOG FILE" Exit 1 } - -if ($null -ne $args[1]) { - $output=$args[1] - Write-Host "Outputting IDs to $output" -} -else { - $output="./successful.txt" +elseif (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] } -Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output -Select-String -Path 
$file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output -Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output -Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output -Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } >> $output +Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } +Select-String -Path $file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } +Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } +Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } +Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } +Select-String -Path $file -Pattern "filtered due to score" | ForEach-Object { -split $_.Line | Select-Object -Index 8 } diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 159fd826..2ed462b7 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ -e "$1" ]; then +if [ -e "$1" ] && [ -f "$1" ]; then file="$1" else echo "CANNOT FIND LOG FILE" diff --git a/scripts/print_summary.ps1 b/scripts/print_summary.ps1 index 1428a862..aa9e9b84 100644 --- a/scripts/print_summary.ps1 +++ b/scripts/print_summary.ps1 @@ -1,17 +1,9 @@ -if (Test-Path -Path $args[0] -PathType Leaf) { - $file=$args[0] -} -else { +if (($args[0] -eq $null) -or -Not (Test-Path -Path $args[0] -PathType Leaf)) { Write-Host "CANNOT FIND LOG FILE" Exit 1 } - -if ($null -ne $args[1]) { - $output=$args[1] - Write-Host "Outputting IDs to $output" -} -else { - $output="./successful.txt" +elseif (Test-Path -Path $args[0] -PathType Leaf) { + $file=$args[0] } Write-Host -NoNewline "Downloaded submissions: " diff --git a/scripts/print_summary.sh b/scripts/print_summary.sh index ec547b7f..be787f63 100755 --- a/scripts/print_summary.sh +++ b/scripts/print_summary.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ -e "$1" ]; then +if [ -e "$1" ] && [ -f "$1" ]; then file="$1" else echo "CANNOT FIND LOG FILE" diff --git a/scripts/tests/example_logfiles/succeed_score_filter.txt b/scripts/tests/example_logfiles/succeed_score_filter.txt index 6430a34b..493da85e 100644 --- a/scripts/tests/example_logfiles/succeed_score_filter.txt +++ b/scripts/tests/example_logfiles/succeed_score_filter.txt @@ -1,2 +1,2 @@ [2022-07-23 14:04:14,095 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 15 < [50] -[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1] +[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyz27 filtered due to score 16 > [1] diff --git a/scripts/tests/extract_failed_ids.Tests.ps1 b/scripts/tests/extract_failed_ids.Tests.ps1 new file mode 100644 index 00000000..20af9509 --- /dev/null +++ b/scripts/tests/extract_failed_ids.Tests.ps1 @@ -0,0 +1,39 @@ +Describe "extract_failed_ids" { + It "fail run no args" { + 
(..\extract_failed_ids.ps1) | Should -Be "CANNOT FIND LOG FILE" + } + + It "fail run no logfile" { + (..\extract_failed_ids.ps1 missing.txt) | Should -Be "CANNOT FIND LOG FILE" + } + + It "fail no downloader module" { + $down_error = (..\extract_failed_ids.ps1 example_logfiles\failed_no_downloader.txt) + $down_error | Should -HaveCount 3 + $down_error | Should -Contain "nxv3ea" + } + + It "fail resource error" { + $res_error = (..\extract_failed_ids.ps1 example_logfiles\failed_resource_error.txt) + $res_error | Should -HaveCount 1 + $res_error | Should -Contain "nxv3dt" + } + + It "fail site downloader error" { + $site_error = (..\extract_failed_ids.ps1 example_logfiles\failed_sitedownloader_error.txt) + $site_error | Should -HaveCount 2 + $site_error | Should -Contain "nxpn0h" + } + + It "fail failed file write" { + $write_error = (..\extract_failed_ids.ps1 example_logfiles\failed_write_error.txt) + $write_error | Should -HaveCount 1 + $write_error | Should -Contain "nnboza" + } + + It "fail disabled module" { + $disabled = (..\extract_failed_ids.ps1 example_logfiles\failed_disabled_module.txt) + $disabled | Should -HaveCount 1 + $disabled | Should -Contain "m2601g" + } +} diff --git a/scripts/tests/extract_successful_ids.Tests.ps1 b/scripts/tests/extract_successful_ids.Tests.ps1 new file mode 100644 index 00000000..8c9e30a8 --- /dev/null +++ b/scripts/tests/extract_successful_ids.Tests.ps1 @@ -0,0 +1,45 @@ +Describe "extract_successful_ids" { + It "fail run no args" { + (..\extract_successful_ids.ps1) | Should -Be "CANNOT FIND LOG FILE" + } + + It "fail run no logfile" { + (..\extract_successful_ids.ps1 missing.txt) | Should -Be "CANNOT FIND LOG FILE" + } + + It "success downloaded submission" { + $down_success = (..\extract_successful_ids.ps1 example_logfiles\succeed_downloaded_submission.txt) + $down_success | Should -HaveCount 7 + $down_success | Should -Contain "nn9cor" + } + + It "success resource hash" { + $hash_success = (..\extract_successful_ids.ps1 example_logfiles\succeed_resource_hash.txt) + $hash_success | Should -HaveCount 1 + $hash_success | Should -Contain "n86jk8" + } + + It "success download filter" { + $filt_success = (..\extract_successful_ids.ps1 example_logfiles\succeed_download_filter.txt) + $filt_success | Should -HaveCount 3 + $filt_success | Should -Contain "nxuxjy" + } + + It "success already exists" { + $exist_success = (..\extract_successful_ids.ps1 example_logfiles\succeed_already_exists.txt) + $exist_success | Should -HaveCount 3 + $exist_success | Should -Contain "nxrq9g" + } + + It "success hard link" { + $link_success = (..\extract_successful_ids.ps1 example_logfiles\succeed_hard_link.txt) + $link_success | Should -HaveCount 1 + $link_success | Should -Contain "nwnp2n" + } + + It "success score filter" { + $score_success = (..\extract_successful_ids.ps1 example_logfiles\succeed_score_filter.txt) + $score_success | Should -HaveCount 2 + $score_success | Should -Contain "ljyz27" + } +} diff --git a/scripts/tests/test_extract_failed_ids.bats b/scripts/tests/test_extract_failed_ids.bats index 04eada68..b1d26ea2 100644 --- a/scripts/tests/test_extract_failed_ids.bats +++ b/scripts/tests/test_extract_failed_ids.bats @@ -7,11 +7,16 @@ teardown() { rm -f failed.txt } -@test "fail run no logfile" { +@test "fail run no args" { run ../extract_failed_ids.sh assert_failure } +@test "fail run no logfile" { + run ../extract_failed_ids.sh ./missing.txt + assert_failure +} + @test "fail no downloader module" { run ../extract_failed_ids.sh 
./example_logfiles/failed_no_downloader.txt echo "$output" > failed.txt diff --git a/scripts/tests/test_extract_successful_ids.bats b/scripts/tests/test_extract_successful_ids.bats index 6ff54bcb..fda656b8 100644 --- a/scripts/tests/test_extract_successful_ids.bats +++ b/scripts/tests/test_extract_successful_ids.bats @@ -7,6 +7,16 @@ teardown() { rm -f successful.txt } +@test "fail run no args" { + run ../extract_successful_ids.sh + assert_failure +} + +@test "fail run no logfile" { + run ../extract_successful_ids.sh ./missing.txt + assert_failure +} + @test "success downloaded submission" { run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt echo "$output" > successful.txt From a5b445945a4a470f5d58a1c66e3f0fe84b858a75 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 24 Feb 2023 15:15:00 -0500 Subject: [PATCH 27/82] Partial revert of 98aa3d7 Reverts some quote changes for awk commands as the prints do not function the same with double quotes when the variable number is over 10 --- scripts/extract_failed_ids.sh | 10 +++++----- scripts/extract_successful_ids.sh | 12 ++++++------ scripts/print_summary.sh | 14 +++++++------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/extract_failed_ids.sh b/scripts/extract_failed_ids.sh index d7629006..9da78d4b 100755 --- a/scripts/extract_failed_ids.sh +++ b/scripts/extract_failed_ids.sh @@ -8,9 +8,9 @@ else fi { - grep "Could not download submission" "$file" | awk "{ print $12 }" | rev | cut -c 2- | rev ; - grep "Failed to download resource" "$file" | awk "{ print $15 }" ; - grep "failed to download submission" "$file" | awk "{ print $14 }" | rev | cut -c 2- | rev ; - grep "Failed to write file" "$file" | awk "{ print $14 }" ; - grep "skipped due to disabled module" "$file" | awk "{ print $9 }" ; + grep "Could not download submission" "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ; + grep "Failed to download resource" "$file" | awk '{ print $15 }' ; + grep "failed to download submission" "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ; + grep "Failed to write file" "$file" | awk '{ print $14 }' ; + grep "skipped due to disabled module" "$file" | awk '{ print $9 }' ; } diff --git a/scripts/extract_successful_ids.sh b/scripts/extract_successful_ids.sh index 2ed462b7..775a7cda 100755 --- a/scripts/extract_successful_ids.sh +++ b/scripts/extract_successful_ids.sh @@ -8,10 +8,10 @@ else fi { - grep "Downloaded submission" "$file" | awk "{ print $(NF-2) }" ; - grep "Resource hash" "$file" | awk "{ print $(NF-2) }" ; - grep "Download filter" "$file" | awk "{ print $(NF-3) }" ; - grep "already exists, continuing" "$file" | awk "{ print $(NF-3) }" ; - grep "Hard link made" "$file" | awk "{ print $(NF) }" ; - grep "filtered due to score" "$file" | awk "{ print $9 }" + grep "Downloaded submission" "$file" | awk '{ print $(NF-2) }' ; + grep "Resource hash" "$file" | awk '{ print $(NF-2) }' ; + grep "Download filter" "$file" | awk '{ print $(NF-3) }' ; + grep "already exists, continuing" "$file" | awk '{ print $(NF-3) }' ; + grep "Hard link made" "$file" | awk '{ print $(NF) }' ; + grep "filtered due to score" "$file" | awk '{ print $9 }' ; } diff --git a/scripts/print_summary.sh b/scripts/print_summary.sh index be787f63..92b47b09 100755 --- a/scripts/print_summary.sh +++ b/scripts/print_summary.sh @@ -7,10 +7,10 @@ else exit 1 fi -echo "Downloaded submissions: $( grep -c 'Downloaded submission' '$file' )" -echo "Failed downloads: $( grep -c 
'failed to download submission' '$file' )" -echo "Files already downloaded: $( grep -c 'already exists, continuing' '$file' )" -echo "Hard linked submissions: $( grep -c 'Hard link made' '$file' )" -echo "Excluded submissions: $( grep -c 'in exclusion list' '$file' )" -echo "Files with existing hash skipped: $( grep -c 'downloaded elsewhere' '$file' )" -echo "Submissions from excluded subreddits: $( grep -c 'in skip list' '$file' )" +echo "Downloaded submissions: $( grep -c 'Downloaded submission' "$file" )" +echo "Failed downloads: $( grep -c 'failed to download submission' "$file" )" +echo "Files already downloaded: $( grep -c 'already exists, continuing' "$file" )" +echo "Hard linked submissions: $( grep -c 'Hard link made' "$file" )" +echo "Excluded submissions: $( grep -c 'in exclusion list' "$file" )" +echo "Files with existing hash skipped: $( grep -c 'downloaded elsewhere' "$file" )" +echo "Submissions from excluded subreddits: $( grep -c 'in skip list' "$file" )" From 905f54f5c06ca58fea93c1a49d5ccffeb3f4065a Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Fri, 24 Feb 2023 15:10:17 -0500 Subject: [PATCH 28/82] Add ruff Adds ruff settings, tests and pre-commit. --- .github/workflows/test.yml | 12 ++++++++---- .gitignore | 5 ++++- .pre-commit-config.yaml | 18 ++++++++---------- docs/CONTRIBUTING.md | 5 ++--- pyproject.toml | 16 ++++++++++++++-- 5 files changed, 36 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 89e99612..80e21229 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,7 +37,7 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade pip Flake8-pyproject pytest pytest-cov + python -m pip install --upgrade pip pytest pytest-cov ruff pip install . - name: Make configuration for tests @@ -46,16 +46,20 @@ jobs: run: | ./devscripts/configure${{ matrix.ext }} - - name: Lint with flake8 + - name: Critical ruff lint run: | - flake8 . --select=E9,F63,F7,F82 + ruff check --format=github --select=E9,F63,F7,F82 . - name: Test with pytest run: | - pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered --cov-report html + pytest -m "not slow" --verbose --cov=./bdfr/ --cov-report term:skip-covered --cov-report html - name: Upload coverage report uses: actions/upload-artifact@v3 with: name: coverage_report path: htmlcov/ + + - name: Full ruff lint + run: | + ruff check --format=github . 
--exit-zero diff --git a/.gitignore b/.gitignore index 3918aa57..46cdf0f3 100644 --- a/.gitignore +++ b/.gitignore @@ -128,6 +128,9 @@ venv.bak/ .dmypy.json dmypy.json +# ruff +.ruff_cache/ + # Pyre type checker .pyre/ @@ -141,4 +144,4 @@ cython_debug/ test_config.cfg .vscode/ -.idea/ \ No newline at end of file +.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0537e57a..1bf956c5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,31 +6,29 @@ repos: rev: v0.12.1 hooks: - id: validate-pyproject + name: validate-pyproject - repo: https://github.com/psf/black rev: 23.1.0 hooks: - id: black + name: black - - repo: https://github.com/pycqa/isort - rev: 5.12.0 + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.0.254 hooks: - - id: isort - name: isort (python) - - - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 - hooks: - - id: flake8 - additional_dependencies: [Flake8-pyproject] + - id: ruff + name: ruff - repo: https://github.com/markdownlint/markdownlint rev: v0.12.0 hooks: - id: markdownlint + name: markdownlint - repo: https://github.com/adamchainz/blacken-docs rev: 1.13.0 hooks: - id: blacken-docs + name: blacken-docs additional_dependencies: [black>=23.1.0] diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 841204d9..11688631 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -73,13 +73,12 @@ python3 -m pip install -e .[dev] The BDFR project uses several tools to manage the code of the project. These include: - [black](https://github.com/psf/black) -- [flake8](https://github.com/john-hen/Flake8-pyproject) -- [isort](https://github.com/PyCQA/isort) - [markdownlint (mdl)](https://github.com/markdownlint/markdownlint) +- [ruff](https://github.com/charliermarsh/ruff) - [tox](https://tox.wiki/en/latest/) - [pre-commit](https://github.com/pre-commit/pre-commit) -The first four tools are formatters. These change the code to the standards expected for the BDFR project. The configuration details for these tools are contained in the [pyproject.toml](../pyproject.toml) file for the project. +The first three tools are formatters. These change the code to the standards expected for the BDFR project. The configuration details for these tools are contained in the [pyproject.toml](../pyproject.toml) file for the project. 
The tool `tox` is used to run tests and tools on demand and has the following environments: diff --git a/pyproject.toml b/pyproject.toml index d0916997..c4d5f08f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,10 +43,9 @@ data-files = {"config" = ["bdfr/default_config.cfg",]} [project.optional-dependencies] dev = [ "black>=23.1.0", - "Flake8-pyproject>=1.2.2", - "isort>=5.12.0", "pre-commit>=3.0.4", "pytest>=7.2.1", + "ruff>=0.0.254", "tox>=3.27.1", ] @@ -87,3 +86,16 @@ markers = [ "slow: test is slow to run", "authenticated: test requires an authenticated Reddit instance", ] + +[tool.ruff] +exclude = ["scripts/tests"] +flake8-annotations = {"allow-star-arg-any" = true, "suppress-dummy-args" = true} +flake8-pytest-style = {"parametrize-values-type" = "tuple", "mark-parentheses" = false} +format = "grouped" +ignore = ["ANN101","B904","N818","RET505"] +line-length = 120 +per-file-ignores={"tests/*"=["ANN","S101","S105","S106"], "scripts/*"=["INP","S105","S106"]} +select = ["ANN","B","BLE","E","ERA","F","I","ICN","INP","ISC","N","PT","PTH","Q","RUF","S","TID","UP","W","YTT"] +show-fixes = true +show-source = true +target-version = "py39" From 1705884dce6aedfe3f83f22549a5d9b2fc820520 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 4 Mar 2023 11:45:20 -0500 Subject: [PATCH 29/82] Use proper user agent string --- bdfr/connector.py | 9 ++++++--- bdfr/oauth2.py | 8 ++++---- tests/test_oauth2.py | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index a6ba23fa..b31b59ea 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -5,9 +5,9 @@ import itertools import logging import logging.handlers +import platform import re import shutil -import socket from abc import ABCMeta, abstractmethod from collections.abc import Callable, Iterable, Iterator from datetime import datetime @@ -21,6 +21,7 @@ import praw.models import prawcore +from bdfr import __version__ from bdfr import exceptions as errors from bdfr.configuration import Configuration from bdfr.download_filter import DownloadFilter @@ -75,6 +76,7 @@ def _setup_internal_objects(self): self.file_name_formatter = self.create_file_name_formatter() logger.log(9, "Create file name formatter") + self.user_agent = praw.const.USER_AGENT_FORMAT.format(":".join([platform.uname()[0], __package__, __version__])) self.create_reddit_instance() self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user])) @@ -138,6 +140,7 @@ def create_reddit_instance(self): scopes, self.cfg_parser.get("DEFAULT", "client_id"), self.cfg_parser.get("DEFAULT", "client_secret"), + user_agent=self.user_agent, ) token = oauth2_authenticator.retrieve_new_token() self.cfg_parser["DEFAULT"]["user_token"] = token @@ -149,7 +152,7 @@ def create_reddit_instance(self): self.reddit_instance = praw.Reddit( client_id=self.cfg_parser.get("DEFAULT", "client_id"), client_secret=self.cfg_parser.get("DEFAULT", "client_secret"), - user_agent=socket.gethostname(), + user_agent=self.user_agent, token_manager=token_manager, ) else: @@ -158,7 +161,7 @@ def create_reddit_instance(self): self.reddit_instance = praw.Reddit( client_id=self.cfg_parser.get("DEFAULT", "client_id"), client_secret=self.cfg_parser.get("DEFAULT", "client_secret"), - user_agent=socket.gethostname(), + user_agent=self.user_agent, ) def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index be96f2c0..5bbce3f5 100644 --- 
a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -16,18 +16,18 @@ class OAuth2Authenticator: - def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str): - self._check_scopes(wanted_scopes) + def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str, user_agent: str): + self._check_scopes(wanted_scopes, user_agent) self.scopes = wanted_scopes self.client_id = client_id self.client_secret = client_secret @staticmethod - def _check_scopes(wanted_scopes: set[str]): + def _check_scopes(wanted_scopes: set[str], user_agent: str): try: response = requests.get( "https://www.reddit.com/api/v1/scopes.json", - headers={"User-Agent": "fetch-scopes test"}, + headers={"User-Agent": user_agent}, timeout=10, ) except TimeoutError: diff --git a/tests/test_oauth2.py b/tests/test_oauth2.py index 14b5cb0c..ad3b1d24 100644 --- a/tests/test_oauth2.py +++ b/tests/test_oauth2.py @@ -33,7 +33,7 @@ def example_config() -> configparser.ConfigParser: ), ) def test_check_scopes(test_scopes: set[str]): - OAuth2Authenticator._check_scopes(test_scopes) + OAuth2Authenticator._check_scopes(test_scopes, "fetch-scopes test") @pytest.mark.parametrize( @@ -67,7 +67,7 @@ def test_split_scopes(test_scopes: str, expected: set[str]): ) def test_check_scopes_bad(test_scopes: set[str]): with pytest.raises(BulkDownloaderException): - OAuth2Authenticator._check_scopes(test_scopes) + OAuth2Authenticator._check_scopes(test_scopes, "fetch-scopes test") def test_token_manager_read(example_config: configparser.ConfigParser): From c09d945c0d5e37df1810c72f7d107157bbe6b5d2 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sun, 26 Feb 2023 22:09:29 -0500 Subject: [PATCH 30/82] Fix test config --- devscripts/configure.ps1 | 4 ++-- devscripts/configure.sh | 6 +++--- tests/integration_tests/test_download_integration.py | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/devscripts/configure.ps1 b/devscripts/configure.ps1 index f5a21521..019686da 100644 --- a/devscripts/configure.ps1 +++ b/devscripts/configure.ps1 @@ -1,5 +1,5 @@ if (-not ([string]::IsNullOrEmpty($env:REDDIT_TOKEN))) { - Copy-Item .\\bdfr\\default_config.cfg .\\test_config.cfg - Write-Output "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg + Copy-Item .\\bdfr\\default_config.cfg .\\tests\\test_config.cfg + Write-Output "`nuser_token = $env:REDDIT_TOKEN" >> ./tests/test_config.cfg } diff --git a/devscripts/configure.sh b/devscripts/configure.sh index f4528b15..e9e93c80 100755 --- a/devscripts/configure.sh +++ b/devscripts/configure.sh @@ -2,6 +2,6 @@ if [ -n "$REDDIT_TOKEN" ] then - cp ./bdfr/default_config.cfg ./test_config.cfg - echo -e "\nuser_token = $REDDIT_TOKEN" >> ./test_config.cfg -fi \ No newline at end of file + cp ./bdfr/default_config.cfg ./tests/test_config.cfg + echo -e "\nuser_token = $REDDIT_TOKEN" >> ./tests/test_config.cfg +fi diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 711f9c08..2a22d7a7 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -2,6 +2,7 @@ import shutil from pathlib import Path +from sys import platform from unittest.mock import MagicMock, patch import prawcore @@ -425,6 +426,7 @@ def test_cli_download_user_reddit_server_error(test_args: list[str], response: i @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for 
integration tests") +@pytest.mark.skipif(platform == "darwin", reason="Test hangs on macos github") @pytest.mark.parametrize( "test_args", ( From caa4421c78ec11ce34fcc71ab6bb5b61b4d58126 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 19:38:17 -0500 Subject: [PATCH 31/82] ANN204 ANN204 fixes --- bdfr/archive_entry/base_archive_entry.py | 2 +- bdfr/archive_entry/comment_archive_entry.py | 2 +- bdfr/archive_entry/submission_archive_entry.py | 2 +- bdfr/archiver.py | 2 +- bdfr/cloner.py | 2 +- bdfr/completion.py | 2 +- bdfr/configuration.py | 2 +- bdfr/connector.py | 2 +- bdfr/download_filter.py | 2 +- bdfr/downloader.py | 2 +- bdfr/file_name_formatter.py | 2 +- bdfr/oauth2.py | 4 ++-- bdfr/resource.py | 4 +++- bdfr/site_authenticator.py | 2 +- bdfr/site_downloaders/base_downloader.py | 2 +- bdfr/site_downloaders/delay_for_reddit.py | 2 +- bdfr/site_downloaders/direct.py | 2 +- bdfr/site_downloaders/erome.py | 2 +- bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py | 2 +- bdfr/site_downloaders/gallery.py | 2 +- bdfr/site_downloaders/gfycat.py | 2 +- bdfr/site_downloaders/imgur.py | 2 +- bdfr/site_downloaders/pornhub.py | 2 +- bdfr/site_downloaders/redgifs.py | 2 +- bdfr/site_downloaders/self_post.py | 2 +- bdfr/site_downloaders/vidble.py | 2 +- bdfr/site_downloaders/vreddit.py | 2 +- bdfr/site_downloaders/youtube.py | 2 +- 28 files changed, 31 insertions(+), 29 deletions(-) diff --git a/bdfr/archive_entry/base_archive_entry.py b/bdfr/archive_entry/base_archive_entry.py index f48662cb..eb19f57d 100644 --- a/bdfr/archive_entry/base_archive_entry.py +++ b/bdfr/archive_entry/base_archive_entry.py @@ -7,7 +7,7 @@ class BaseArchiveEntry(ABC): - def __init__(self, source: Union[Comment, Submission]): + def __init__(self, source: Union[Comment, Submission]) -> None: self.source = source self.post_details: dict = {} diff --git a/bdfr/archive_entry/comment_archive_entry.py b/bdfr/archive_entry/comment_archive_entry.py index 3ee53475..eea5cdf1 100644 --- a/bdfr/archive_entry/comment_archive_entry.py +++ b/bdfr/archive_entry/comment_archive_entry.py @@ -10,7 +10,7 @@ class CommentArchiveEntry(BaseArchiveEntry): - def __init__(self, comment: praw.models.Comment): + def __init__(self, comment: praw.models.Comment) -> None: super().__init__(comment) def compile(self) -> dict: diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index 2a3fac5b..c3d45204 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -10,7 +10,7 @@ class SubmissionArchiveEntry(BaseArchiveEntry): - def __init__(self, submission: praw.models.Submission): + def __init__(self, submission: praw.models.Submission) -> None: super().__init__(submission) def compile(self) -> dict: diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 7118ba65..72bc77d2 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -25,7 +25,7 @@ class Archiver(RedditConnector): - def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()): + def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None: super().__init__(args, logging_handlers) def download(self): diff --git a/bdfr/cloner.py b/bdfr/cloner.py index 758e5c89..aa5de4a7 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -14,7 +14,7 @@ class RedditCloner(RedditDownloader, Archiver): - def __init__(self, args: Configuration, logging_handlers: 
Iterable[logging.Handler] = ()): + def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None: super().__init__(args, logging_handlers) def download(self): diff --git a/bdfr/completion.py b/bdfr/completion.py index d9f82261..9427efd8 100644 --- a/bdfr/completion.py +++ b/bdfr/completion.py @@ -8,7 +8,7 @@ class Completion: - def __init__(self, shell: str): + def __init__(self, shell: str) -> None: self.shell = shell self.env = environ.copy() self.share_dir = appdirs.user_data_dir() diff --git a/bdfr/configuration.py b/bdfr/configuration.py index b7e7a251..36dbfb64 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -12,7 +12,7 @@ class Configuration(Namespace): - def __init__(self): + def __init__(self) -> None: super().__init__() self.authenticate = False self.config = None diff --git a/bdfr/connector.py b/bdfr/connector.py index b31b59ea..086fe9da 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -51,7 +51,7 @@ class TimeType(Enum): class RedditConnector(metaclass=ABCMeta): - def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()): + def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None: self.args = args self.config_directories = appdirs.AppDirs("bdfr", "BDFR") self.determine_directories() diff --git a/bdfr/download_filter.py b/bdfr/download_filter.py index 518c6663..0e6f1c6b 100644 --- a/bdfr/download_filter.py +++ b/bdfr/download_filter.py @@ -9,7 +9,7 @@ class DownloadFilter: - def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None): + def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None) -> None: self.excluded_extensions = excluded_extensions self.excluded_domains = excluded_domains diff --git a/bdfr/downloader.py b/bdfr/downloader.py index bd242b3c..d9c58eb8 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -36,7 +36,7 @@ def _calc_hash(existing_file: Path): class RedditDownloader(RedditConnector): - def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()): + def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None: super().__init__(args, logging_handlers) if self.args.search_existing: self.master_hash_list = self.scan_existing_files(self.download_directory) diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 942d5b54..10c54b30 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -35,7 +35,7 @@ def __init__( directory_format_string: str, time_format_string: str, restriction_scheme: Optional[str] = None, - ): + ) -> None: if not self.validate_string(file_format_string): raise BulkDownloaderException(f"{file_format_string!r} is not a valid format string") self.file_format_string = file_format_string diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 5bbce3f5..a305cd42 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -16,7 +16,7 @@ class OAuth2Authenticator: - def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str, user_agent: str): + def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str, user_agent: str) -> None: self._check_scopes(wanted_scopes, user_agent) self.scopes = wanted_scopes self.client_id = client_id @@ -92,7 +92,7 @@ def send_message(client: socket.socket, message: str = ""): class OAuth2TokenManager(praw.reddit.BaseTokenManager): - def __init__(self, config: 
configparser.ConfigParser, config_location: Path): + def __init__(self, config: configparser.ConfigParser, config_location: Path) -> None: super().__init__() self.config = config self.config_location = config_location diff --git a/bdfr/resource.py b/bdfr/resource.py index 012270e2..01dfb462 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -18,7 +18,9 @@ class Resource: - def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None): + def __init__( + self, source_submission: Submission, url: str, download_function: Callable, extension: str = None + ) -> None: self.source_submission = source_submission self.content: Optional[bytes] = None self.url = url diff --git a/bdfr/site_authenticator.py b/bdfr/site_authenticator.py index 5e177c93..764010bd 100644 --- a/bdfr/site_authenticator.py +++ b/bdfr/site_authenticator.py @@ -4,5 +4,5 @@ class SiteAuthenticator: - def __init__(self, cfg: configparser.ConfigParser): + def __init__(self, cfg: configparser.ConfigParser) -> None: self.imgur_authentication = None diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py index 8b4f892d..98d67073 100644 --- a/bdfr/site_downloaders/base_downloader.py +++ b/bdfr/site_downloaders/base_downloader.py @@ -15,7 +15,7 @@ class BaseDownloader(ABC): - def __init__(self, post: Submission, typical_extension: Optional[str] = None): + def __init__(self, post: Submission, typical_extension: Optional[str] = None) -> None: self.post = post self.typical_extension = typical_extension diff --git a/bdfr/site_downloaders/delay_for_reddit.py b/bdfr/site_downloaders/delay_for_reddit.py index 33807316..64c58dc3 100644 --- a/bdfr/site_downloaders/delay_for_reddit.py +++ b/bdfr/site_downloaders/delay_for_reddit.py @@ -13,7 +13,7 @@ class DelayForReddit(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/direct.py b/bdfr/site_downloaders/direct.py index 4a6ac92e..3f919d09 100644 --- a/bdfr/site_downloaders/direct.py +++ b/bdfr/site_downloaders/direct.py @@ -10,7 +10,7 @@ class Direct(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/erome.py b/bdfr/site_downloaders/erome.py index d1411602..5b7ad5e2 100644 --- a/bdfr/site_downloaders/erome.py +++ b/bdfr/site_downloaders/erome.py @@ -17,7 +17,7 @@ class Erome(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py index 86c2481d..40745992 100644 --- a/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py +++ b/bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py @@ -15,7 +15,7 @@ class YtdlpFallback(BaseFallbackDownloader, Youtube): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git 
a/bdfr/site_downloaders/gallery.py b/bdfr/site_downloaders/gallery.py index a2f65f4f..e091bfb0 100644 --- a/bdfr/site_downloaders/gallery.py +++ b/bdfr/site_downloaders/gallery.py @@ -14,7 +14,7 @@ class Gallery(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py index f5e7bad5..02a1e7e1 100644 --- a/bdfr/site_downloaders/gfycat.py +++ b/bdfr/site_downloaders/gfycat.py @@ -14,7 +14,7 @@ class Gfycat(Redgifs): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index cf15c3c0..d1b74a24 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -13,7 +13,7 @@ class Imgur(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) self.raw_data = {} diff --git a/bdfr/site_downloaders/pornhub.py b/bdfr/site_downloaders/pornhub.py index b12db8ee..e45fe1e6 100644 --- a/bdfr/site_downloaders/pornhub.py +++ b/bdfr/site_downloaders/pornhub.py @@ -14,7 +14,7 @@ class PornHub(Youtube): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index d8410a12..b1939481 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -14,7 +14,7 @@ class Redgifs(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/self_post.py b/bdfr/site_downloaders/self_post.py index 1b76b922..0104b761 100644 --- a/bdfr/site_downloaders/self_post.py +++ b/bdfr/site_downloaders/self_post.py @@ -13,7 +13,7 @@ class SelfPost(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/vidble.py b/bdfr/site_downloaders/vidble.py index 9fdcecd6..ea4b3192 100644 --- a/bdfr/site_downloaders/vidble.py +++ b/bdfr/site_downloaders/vidble.py @@ -17,7 +17,7 @@ class Vidble(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/vreddit.py b/bdfr/site_downloaders/vreddit.py index 04cfed1d..4158e077 100644 --- a/bdfr/site_downloaders/vreddit.py +++ b/bdfr/site_downloaders/vreddit.py @@ -14,7 +14,7 @@ class VReddit(Youtube): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: diff --git a/bdfr/site_downloaders/youtube.py b/bdfr/site_downloaders/youtube.py index 71d7be00..f0e2b4a2 
100644 --- a/bdfr/site_downloaders/youtube.py +++ b/bdfr/site_downloaders/youtube.py @@ -18,7 +18,7 @@ class Youtube(BaseDownloader): - def __init__(self, post: Submission): + def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: From a5f5df7ab3fb43f6e17e84488a821097805165d0 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 20:16:28 -0500 Subject: [PATCH 32/82] ANN201 ANN201 fixes --- bdfr/__main__.py | 12 ++++++------ bdfr/archiver.py | 4 ++-- bdfr/cloner.py | 2 +- bdfr/completion.py | 4 ++-- bdfr/configuration.py | 4 ++-- bdfr/connector.py | 14 +++++++------- bdfr/downloader.py | 2 +- bdfr/oauth2.py | 4 ++-- bdfr/resource.py | 4 ++-- 9 files changed, 25 insertions(+), 25 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 670f0a35..6b660473 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -100,7 +100,7 @@ def _check_version(context, _param, value): callback=_check_version, help="Check version and exit.", ) -def cli(): +def cli() -> None: """BDFR is used to download and archive content from Reddit.""" pass @@ -110,7 +110,7 @@ def cli(): @_add_options(_downloader_options) @click.help_option("-h", "--help") @click.pass_context -def cli_download(context: click.Context, **_): +def cli_download(context: click.Context, **_) -> None: """Used to download content posted to Reddit.""" config = Configuration() config.process_click_arguments(context) @@ -131,7 +131,7 @@ def cli_download(context: click.Context, **_): @_add_options(_archiver_options) @click.help_option("-h", "--help") @click.pass_context -def cli_archive(context: click.Context, **_): +def cli_archive(context: click.Context, **_) -> None: """Used to archive post data from Reddit.""" config = Configuration() config.process_click_arguments(context) @@ -153,7 +153,7 @@ def cli_archive(context: click.Context, **_): @_add_options(_downloader_options) @click.help_option("-h", "--help") @click.pass_context -def cli_clone(context: click.Context, **_): +def cli_clone(context: click.Context, **_) -> None: """Combines archive and download commands.""" config = Configuration() config.process_click_arguments(context) @@ -173,7 +173,7 @@ def cli_clone(context: click.Context, **_): @click.argument("shell", type=click.Choice(("all", "bash", "fish", "zsh"), case_sensitive=False), default="all") @click.help_option("-h", "--help") @click.option("-u", "--uninstall", is_flag=True, default=False, help="Uninstall completion") -def cli_completion(shell: str, uninstall: bool): +def cli_completion(shell: str, uninstall: bool) -> None: """\b Installs shell completions for BDFR. 
Options: all, bash, fish, zsh @@ -215,7 +215,7 @@ def filter(self, record: logging.LogRecord) -> bool: return stream -def silence_module_loggers(): +def silence_module_loggers() -> None: logging.getLogger("praw").setLevel(logging.CRITICAL) logging.getLogger("prawcore").setLevel(logging.CRITICAL) logging.getLogger("urllib3").setLevel(logging.CRITICAL) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 72bc77d2..9fd4404e 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -28,7 +28,7 @@ class Archiver(RedditConnector): def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None: super().__init__(args, logging_handlers) - def download(self): + def download(self) -> None: for generator in self.reddit_lists: try: for submission in generator: @@ -82,7 +82,7 @@ def _pull_lever_entry_factory(praw_item: Union[praw.models.Submission, praw.mode else: raise ArchiverError(f"Factory failed to classify item of type {type(praw_item).__name__}") - def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comment]): + def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comment]) -> None: if self.args.comment_context and isinstance(praw_item, praw.models.Comment): logger.debug(f"Converting comment {praw_item.id} to submission {praw_item.submission.id}") praw_item = praw_item.submission diff --git a/bdfr/cloner.py b/bdfr/cloner.py index aa5de4a7..18cbb34b 100644 --- a/bdfr/cloner.py +++ b/bdfr/cloner.py @@ -17,7 +17,7 @@ class RedditCloner(RedditDownloader, Archiver): def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()) -> None: super().__init__(args, logging_handlers) - def download(self): + def download(self) -> None: for generator in self.reddit_lists: try: for submission in generator: diff --git a/bdfr/completion.py b/bdfr/completion.py index 9427efd8..3d8ccb97 100644 --- a/bdfr/completion.py +++ b/bdfr/completion.py @@ -14,7 +14,7 @@ def __init__(self, shell: str) -> None: self.share_dir = appdirs.user_data_dir() self.entry_points = ["bdfr", "bdfr-archive", "bdfr-clone", "bdfr-download"] - def install(self): + def install(self) -> None: if self.shell in ("all", "bash"): comp_dir = self.share_dir + "/bash-completion/completions/" if not Path(comp_dir).exists(): @@ -46,7 +46,7 @@ def install(self): file.write(subprocess.run([point], env=self.env, capture_output=True, text=True).stdout) print(f"Zsh completion for {point} written to {comp_dir}_{point}") - def uninstall(self): + def uninstall(self) -> None: if self.shell in ("all", "bash"): comp_dir = self.share_dir + "/bash-completion/completions/" for point in self.entry_points: diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 36dbfb64..10b5e4a5 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -58,7 +58,7 @@ def __init__(self) -> None: self.format = "json" self.comment_context: bool = False - def process_click_arguments(self, context: click.Context): + def process_click_arguments(self, context: click.Context) -> None: if context.params.get("opts") is not None: self.parse_yaml_options(context.params["opts"]) for arg_key in context.params.keys(): @@ -71,7 +71,7 @@ def process_click_arguments(self, context: click.Context): continue setattr(self, arg_key, val) - def parse_yaml_options(self, file_path: str): + def parse_yaml_options(self, file_path: str) -> None: yaml_file_loc = Path(file_path) if not yaml_file_loc.exists(): logger.error(f"No YAML file found at {yaml_file_loc}") diff --git a/bdfr/connector.py 
b/bdfr/connector.py index 086fe9da..958967ff 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -100,7 +100,7 @@ def _apply_logging_handlers(handlers: Iterable[logging.Handler]): for handler in handlers: main_logger.addHandler(handler) - def read_config(self): + def read_config(self) -> None: """Read any cfg values that need to be processed""" if self.args.max_wait_time is None: self.args.max_wait_time = self.cfg_parser.getint("DEFAULT", "max_wait_time", fallback=120) @@ -122,14 +122,14 @@ def read_config(self): with Path(self.config_location).open(mode="w") as file: self.cfg_parser.write(file) - def parse_disabled_modules(self): + def parse_disabled_modules(self) -> None: disabled_modules = self.args.disable_module disabled_modules = self.split_args_input(disabled_modules) disabled_modules = {name.strip().lower() for name in disabled_modules} self.args.disable_module = disabled_modules logger.debug(f"Disabling the following modules: {', '.join(self.args.disable_module)}") - def create_reddit_instance(self): + def create_reddit_instance(self) -> None: if self.args.authenticate: logger.debug("Using authenticated Reddit instance") if not self.cfg_parser.has_option("DEFAULT", "user_token"): @@ -176,14 +176,14 @@ def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: logger.log(9, "Retrieved submissions for given links") return master_list - def determine_directories(self): + def determine_directories(self) -> None: self.download_directory = Path(self.args.directory).resolve().expanduser() self.config_directory = Path(self.config_directories.user_config_dir) self.download_directory.mkdir(exist_ok=True, parents=True) self.config_directory.mkdir(exist_ok=True, parents=True) - def load_config(self): + def load_config(self) -> None: self.cfg_parser = configparser.ConfigParser() if self.args.config: if (cfg_path := Path(self.args.config)).exists(): @@ -393,7 +393,7 @@ def get_user_data(self) -> list[Iterator]: else: return [] - def check_user_existence(self, name: str): + def check_user_existence(self, name: str) -> None: user = self.reddit_instance.redditor(name=name) try: if user.id: @@ -428,7 +428,7 @@ def create_authenticator(self) -> SiteAuthenticator: return SiteAuthenticator(self.cfg_parser) @abstractmethod - def download(self): + def download(self) -> None: pass @staticmethod diff --git a/bdfr/downloader.py b/bdfr/downloader.py index d9c58eb8..f9267d2c 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -41,7 +41,7 @@ def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handl if self.args.search_existing: self.master_hash_list = self.scan_existing_files(self.download_directory) - def download(self): + def download(self) -> None: for generator in self.reddit_lists: try: for submission in generator: diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index a305cd42..3b7aca1f 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -97,7 +97,7 @@ def __init__(self, config: configparser.ConfigParser, config_location: Path) -> self.config = config self.config_location = config_location - def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer): + def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer) -> None: if authorizer.refresh_token is None: if self.config.has_option("DEFAULT", "user_token"): authorizer.refresh_token = self.config.get("DEFAULT", "user_token") @@ -105,7 +105,7 @@ def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer): else: raise RedditAuthenticationError("No auth token loaded in configuration") - def 
post_refresh_callback(self, authorizer: praw.reddit.Authorizer): + def post_refresh_callback(self, authorizer: praw.reddit.Authorizer) -> None: self.config.set("DEFAULT", "user_token", authorizer.refresh_token) with Path(self.config_location).open(mode="w") as file: self.config.write(file, True) diff --git a/bdfr/resource.py b/bdfr/resource.py index 01dfb462..375e6ce2 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -34,7 +34,7 @@ def __init__( def retry_download(url: str) -> Callable: return lambda global_params: Resource.http_download(url, global_params) - def download(self, download_parameters: Optional[dict] = None): + def download(self, download_parameters: Optional[dict] = None) -> None: if download_parameters is None: download_parameters = {} if not self.content: @@ -49,7 +49,7 @@ def download(self, download_parameters: Optional[dict] = None): if not self.hash and self.content: self.create_hash() - def create_hash(self): + def create_hash(self) -> None: self.hash = hashlib.md5(self.content, usedforsecurity=False) def _determine_extension(self) -> Optional[str]: From 0c4cfd8b440a60471cf65bfe8474573fe5c18cf8 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 20:32:48 -0500 Subject: [PATCH 33/82] ANN202 ANN202 fixes --- bdfr/__main__.py | 6 +++--- bdfr/archive_entry/submission_archive_entry.py | 2 +- bdfr/archiver.py | 8 ++++---- bdfr/connector.py | 2 +- bdfr/downloader.py | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 6b660473..f34125de 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -68,8 +68,8 @@ ] -def _add_options(opts: list): - def wrap(func): +def _add_options(opts: list): # noqa: ANN202 + def wrap(func): # noqa: ANN202 for opt in opts: func = opt(func) return func @@ -77,7 +77,7 @@ def wrap(func): return wrap -def _check_version(context, _param, value): +def _check_version(context, _param, value) -> None: if not value or context.resilient_parsing: return current = __version__ diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index c3d45204..3a6b855e 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -20,7 +20,7 @@ def compile(self) -> dict: out["comments"] = comments return out - def _get_post_details(self): + def _get_post_details(self) -> None: self.post_details = { "title": self.source.title, "name": self.source.name, diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 9fd4404e..023d18e1 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -97,22 +97,22 @@ def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comme raise ArchiverError(f"Unknown format {self.args.format!r} given") logger.info(f"Record for entry item {praw_item.id} written to disk") - def _write_entry_json(self, entry: BaseArchiveEntry): + def _write_entry_json(self, entry: BaseArchiveEntry) -> None: resource = Resource(entry.source, "", lambda: None, ".json") content = json.dumps(entry.compile()) self._write_content_to_disk(resource, content) - def _write_entry_xml(self, entry: BaseArchiveEntry): + def _write_entry_xml(self, entry: BaseArchiveEntry) -> None: resource = Resource(entry.source, "", lambda: None, ".xml") content = dict2xml.dict2xml(entry.compile(), wrap="root") self._write_content_to_disk(resource, content) - def _write_entry_yaml(self, entry: BaseArchiveEntry): + def _write_entry_yaml(self, entry: 
BaseArchiveEntry) -> None: resource = Resource(entry.source, "", lambda: None, ".yaml") content = yaml.safe_dump(entry.compile()) self._write_content_to_disk(resource, content) - def _write_content_to_disk(self, resource: Resource, content: str): + def _write_content_to_disk(self, resource: Resource, content: str) -> None: file_path = self.file_name_formatter.format_path(resource, self.download_directory) file_path.parent.mkdir(exist_ok=True, parents=True) with Path(file_path).open(mode="w", encoding="utf-8") as file: diff --git a/bdfr/connector.py b/bdfr/connector.py index 958967ff..48de290e 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -64,7 +64,7 @@ def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handl self.reddit_lists = self.retrieve_reddit_lists() - def _setup_internal_objects(self): + def _setup_internal_objects(self) -> None: self.parse_disabled_modules() self.download_filter = self.create_download_filter() diff --git a/bdfr/downloader.py b/bdfr/downloader.py index f9267d2c..9250e276 100644 --- a/bdfr/downloader.py +++ b/bdfr/downloader.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -def _calc_hash(existing_file: Path): +def _calc_hash(existing_file: Path) -> tuple[Path, str]: chunk_size = 1024 * 1024 md5_hash = hashlib.md5(usedforsecurity=False) with existing_file.open("rb") as file: @@ -54,7 +54,7 @@ def download(self) -> None: logger.debug("Waiting 60 seconds to continue") sleep(60) - def _download_submission(self, submission: praw.models.Submission): + def _download_submission(self, submission: praw.models.Submission) -> None: if submission.id in self.excluded_submission_ids: logger.debug(f"Object {submission.id} in exclusion list, skipping") return From c27fcb08d7342e41209b13075462db9b03e0f82f Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 20:47:12 -0500 Subject: [PATCH 34/82] ANN205 ANN205 fixes --- bdfr/connector.py | 7 ++++--- bdfr/oauth2.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 48de290e..5328cf67 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -95,7 +95,7 @@ def _setup_internal_objects(self) -> None: self.args.skip_subreddit = {sub.lower() for sub in self.args.skip_subreddit} @staticmethod - def _apply_logging_handlers(handlers: Iterable[logging.Handler]): + def _apply_logging_handlers(handlers: Iterable[logging.Handler]) -> None: main_logger = logging.getLogger() for handler in handlers: main_logger.addHandler(handler) @@ -432,11 +432,12 @@ def download(self) -> None: pass @staticmethod - def check_subreddit_status(subreddit: praw.models.Subreddit): + def check_subreddit_status(subreddit: praw.models.Subreddit) -> None: if subreddit.display_name in ("all", "friends"): return try: - assert subreddit.id + if subreddit.id: + return except prawcore.NotFound: raise errors.BulkDownloaderException(f"Source {subreddit.display_name} cannot be found") except prawcore.Redirect: diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index 3b7aca1f..e5f887f3 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -23,7 +23,7 @@ def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str, self.client_secret = client_secret @staticmethod - def _check_scopes(wanted_scopes: set[str], user_agent: str): + def _check_scopes(wanted_scopes: set[str], user_agent: str) -> None: try: response = requests.get( "https://www.reddit.com/api/v1/scopes.json", @@ -86,7 +86,7 @@ def receive_connection() 
-> socket.socket: return client @staticmethod - def send_message(client: socket.socket, message: str = ""): + def send_message(client: socket.socket, message: str = "") -> None: client.send(f"HTTP/1.1 200 OK\r\n\r\n{message}".encode()) client.close() From 3b69d8389245a2dec5eb4681e9feb8a954ec47e4 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 18 Feb 2023 22:10:35 -0500 Subject: [PATCH 35/82] ANN001 ANN001 fixes --- bdfr/__main__.py | 4 ++-- bdfr/connector.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index f34125de..bb85f8b8 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -69,7 +69,7 @@ def _add_options(opts: list): # noqa: ANN202 - def wrap(func): # noqa: ANN202 + def wrap(func): # noqa: ANN001,ANN202 for opt in opts: func = opt(func) return func @@ -77,7 +77,7 @@ def wrap(func): # noqa: ANN202 return wrap -def _check_version(context, _param, value) -> None: +def _check_version(context: click.core.Context, _param, value: bool) -> None: if not value or context.resilient_parsing: return current = __version__ diff --git a/bdfr/connector.py b/bdfr/connector.py index 5328cf67..d5136cfd 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -14,6 +14,7 @@ from enum import Enum, auto from pathlib import Path from time import sleep +from typing import Union import appdirs import praw @@ -349,7 +350,9 @@ def get_multireddits(self) -> list[Iterator]: else: return [] - def create_filtered_listing_generator(self, reddit_source) -> Iterator: + def create_filtered_listing_generator( + self, reddit_source: Union[praw.models.Subreddit, praw.models.Multireddit, praw.models.Redditor.submissions] + ) -> Iterator: sort_function = self.determine_sort_function() if self.sort_filter in (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL): return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value) From 7fbd001e8ad797804bd306362c1e66e9f2fa80c0 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 11 Mar 2023 10:24:14 -0500 Subject: [PATCH 36/82] Fix Youtube test tested video now private. Updated to new video that should stay up. 
--- tests/site_downloaders/test_youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/site_downloaders/test_youtube.py b/tests/site_downloaders/test_youtube.py index bf832bb2..d990dbdf 100644 --- a/tests/site_downloaders/test_youtube.py +++ b/tests/site_downloaders/test_youtube.py @@ -15,7 +15,7 @@ ("test_url", "expected_hash"), ( ("https://www.youtube.com/watch?v=uSm2VDgRIUs", "2d60b54582df5b95ec72bb00b580d2ff"), - ("https://www.youtube.com/watch?v=GcI7nxQj7HA", "5db0fc92a0a7fb9ac91e63505eea9cf0"), + ("https://www.youtube.com/watch?v=NcA_j23HuDU", "26e6ca4849267e600ff474f4260c3b5b"), ), ) def test_find_resources_good(test_url: str, expected_hash: str): From 87fd63d7586d1730b144550360e1ea2d21d6654f Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 23 Mar 2023 13:24:03 -0400 Subject: [PATCH 37/82] Test fixes --- .../fallback_downloaders/test_ytdlp_fallback.py | 2 +- tests/site_downloaders/test_erome.py | 2 +- tests/site_downloaders/test_gfycat.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py index 9823d081..f058abf5 100644 --- a/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py +++ b/tests/site_downloaders/fallback_downloaders/test_ytdlp_fallback.py @@ -14,7 +14,7 @@ ("test_url", "expected"), ( ("https://www.reddit.com/r/specializedtools/comments/n2nw5m/bamboo_splitter/", True), - ("https://www.youtube.com/watch?v=P19nvJOmqCc", True), + ("https://www.youtube.com/watch?v=DWUbA501CO4", True), ("https://www.example.com/test", False), ("https://milesmatrix.bandcamp.com/album/la-boum/", False), ("https://v.redd.it/dlr54z8p182a1", True), diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 7dd31813..a82ccd12 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -39,7 +39,7 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): ( ("https://www.erome.com/a/vqtPuLXh", 1), ("https://www.erome.com/a/4tP3KI6F", 1), - ("https://www.erome.com/a/WNyK674a", 41), + ("https://www.erome.com/a/FsRWKycz", 1), ), ) def test_download_resource(test_url: str, expected_hashes_len: int): diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py index 220c8071..0daaeb6f 100644 --- a/tests/site_downloaders/test_gfycat.py +++ b/tests/site_downloaders/test_gfycat.py @@ -21,7 +21,7 @@ def test_auth_cache(): ( ("https://gfycat.com/definitivecaninecrayfish", "https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4"), ("https://gfycat.com/dazzlingsilkyiguana", "https://giant.gfycat.com/DazzlingSilkyIguana.mp4"), - ("https://gfycat.com/WearyComposedHairstreak", "https://thumbs44.redgifs.com/WearyComposedHairstreak.mp4"), + ("https://gfycat.com/ComposedWholeBullfrog", "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4"), ( "https://thumbs.gfycat.com/ComposedWholeBullfrog-size_restricted.gif", "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4", @@ -43,7 +43,7 @@ def test_get_link(test_url: str, expected_url: str): ( ("https://gfycat.com/definitivecaninecrayfish", "48f9bd4dbec1556d7838885612b13b39"), ("https://gfycat.com/dazzlingsilkyiguana", "808941b48fc1e28713d36dd7ed9dc648"), - ("https://gfycat.com/WearyComposedHairstreak", "5f82ba1ba23cc927c9fbb0c0421953a5"), + ("https://gfycat.com/ComposedWholeBullfrog", "5292343665a13b5369d889d911ae284d"), 
("https://thumbs.gfycat.com/ComposedWholeBullfrog-size_restricted.gif", "5292343665a13b5369d889d911ae284d"), ("https://giant.gfycat.com/ComposedWholeBullfrog.mp4", "5292343665a13b5369d889d911ae284d"), ), From b90ab1b02a80883c41b7b369890793eb5c2b9a68 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 2 Mar 2023 22:20:03 -0500 Subject: [PATCH 38/82] Update docs --- scripts/README.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 70df8d16..39e9dfc2 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,11 +2,11 @@ Due to the verboseness of the logs, a great deal of information can be gathered quite easily from the BDFR's logfiles. In this folder, there is a selection of scripts that parse these logs, scraping useful bits of information. Since the logfiles are recurring patterns of strings, it is a fairly simple matter to write scripts that utilise tools included on most Linux systems. - - [Script to extract all successfully downloaded IDs](#extract-all-successfully-downloaded-ids) - - [Script to extract all failed download IDs](#extract-all-failed-ids) - - [Timestamp conversion](#converting-bdfrv1-timestamps-to-bdfrv2-timestamps) - - [Printing summary statistics for a run](#printing-summary-statistics) - - [Unsaving posts from your account after downloading](#unsave-posts-after-downloading) +- [Script to extract all successfully downloaded IDs](#extract-all-successfully-downloaded-ids) +- [Script to extract all failed download IDs](#extract-all-failed-ids) +- [Timestamp conversion](#converting-bdfrv1-timestamps-to-bdfrv2-timestamps) +- [Printing summary statistics for a run](#printing-summary-statistics) +- [Unsaving posts from your account after downloading](#unsave-posts-after-downloading) ## Extract all Successfully Downloaded IDs @@ -15,7 +15,7 @@ This script is contained [here](extract_successful_ids.sh) and will result in a The script can be used with the following signature: ```bash -./extract_successful_ids.sh LOGFILE_LOCATION +./extract_successful_ids.sh LOGFILE_LOCATION >> ``` By default, if the second argument is not supplied, the script will write the results to `successful.txt`. @@ -32,7 +32,7 @@ An example of the script being run on a Linux machine is the following: The script can be used with the following signature: ```bash -./extract_failed_ids.sh LOGFILE_LOCATION +./extract_failed_ids.sh LOGFILE_LOCATION >> ``` By default, if the second argument is not supplied, the script will write the results to `failed.txt`. @@ -72,19 +72,20 @@ Submissions from excluded subreddits: 0 ## Unsave Posts After Downloading [This script](unsaveposts.py) takes a list of submission IDs from a file named `successfulids` created with the `extract_successful_ids.sh` script and unsaves them from your account. To make it work you will need to make a user script in your reddit profile like this: - - Fill in the username and password fields in the script. Make sure you keep the quotes around the fields. - - Go to https://old.reddit.com/prefs/apps/ - - Click on `Develop an app` at the bottom. - - Make sure you select a `script` not a `web app`. - - Name it `Unsave Posts`. - - Fill in the `Redirect URI` field with `127.0.0.0`. - - Save it. - - Fill in the `client_id` and `client_secret` fields on the script. The client ID is the 14 character string under the name you gave your script. .It'll look like a bunch of random characters like this: pspYLwDoci9z_A. 
The client secret is the longer string next to "secret". Again keep the quotes around the fields. + +- Fill in the username and password fields in the script. Make sure you keep the quotes around the fields. +- Go to +- Click on `Develop an app` at the bottom. +- Make sure you select a `script` not a `web app`. +- Name it `Unsave Posts`. +- Fill in the `Redirect URI` field with `127.0.0.0`. +- Save it. +- Fill in the `client_id` and `client_secret` fields on the script. The client ID is the 14 character string under the name you gave your script. .It'll look like a bunch of random characters like this: pspYLwDoci9z_A. The client secret is the longer string next to "secret". Again keep the quotes around the fields. Now the script is ready tu run. Just execute it like this: ```bash -python3.9 -m bdfr download DOWNLOAD_DIR --authenticate --user me --saved --log LOGFILE_LOCATION +bdfr download DOWNLOAD_DIR --authenticate --user me --saved --log LOGFILE_LOCATION ./extract_successful_ids.sh LOGFILE_LOCATION > successfulids ./unsaveposts.py ``` From 38bef1d1e0ac59d40b83c922e0471833311fab19 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 27 Apr 2023 13:52:20 -0400 Subject: [PATCH 39/82] Add Catbox downloader Adds downloader for catbox.moe collections. --- bdfr/site_downloaders/catbox.py | 39 +++++++++++++++ bdfr/site_downloaders/download_factory.py | 3 ++ tests/site_downloaders/test_catbox.py | 59 +++++++++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 bdfr/site_downloaders/catbox.py create mode 100644 tests/site_downloaders/test_catbox.py diff --git a/bdfr/site_downloaders/catbox.py b/bdfr/site_downloaders/catbox.py new file mode 100644 index 00000000..09a88c55 --- /dev/null +++ b/bdfr/site_downloaders/catbox.py @@ -0,0 +1,39 @@ +import logging +from itertools import chain +from typing import Optional + +import bs4 +from praw.models import Submission + +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + +logger = logging.getLogger(__name__) + + +class Catbox(BaseDownloader): + def __init__(self, post: Submission) -> None: + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + links = self.get_links(self.post.url) + if not links: + raise SiteDownloaderError("Catbox parser could not find any links") + links = [Resource(self.post, link, Resource.retry_download(link)) for link in links] + return links + + @staticmethod + def get_links(url: str) -> set[str]: + content = Catbox.retrieve_url(url) + soup = bs4.BeautifulSoup(content.text, "html.parser") + collection_div = soup.find("div", attrs={"class": "imagecontainer"}) + images = collection_div.find_all("a") + images = [link.get("href") for link in images] + videos = collection_div.find_all("video") + videos = [link.get("src") for link in videos] + audios = collection_div.find_all("audio") + audios = [link.get("src") for link in audios] + resources = chain(images, videos, audios) + return set(resources) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index d4fd83a7..b3485617 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -5,6 +5,7 @@ from bdfr.exceptions import NotADownloadableLinkError from bdfr.site_downloaders.base_downloader import BaseDownloader 
+from bdfr.site_downloaders.catbox import Catbox from bdfr.site_downloaders.delay_for_reddit import DelayForReddit from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.erome import Erome @@ -36,6 +37,8 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Direct elif re.match(r"erome\.com.*", sanitised_url): return Erome + elif re.match(r"catbox\.moe", sanitised_url): + return Catbox elif re.match(r"delayforreddit\.com", sanitised_url): return DelayForReddit elif re.match(r"reddit\.com/gallery/.*", sanitised_url): diff --git a/tests/site_downloaders/test_catbox.py b/tests/site_downloaders/test_catbox.py new file mode 100644 index 00000000..e6dc2478 --- /dev/null +++ b/tests/site_downloaders/test_catbox.py @@ -0,0 +1,59 @@ +from unittest.mock import Mock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.catbox import Catbox + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_url", "expected"), + ( + ( + "https://catbox.moe/c/vel5eg", + { + "https://files.catbox.moe/h2dx9k.gif", + "https://files.catbox.moe/bc83lg.png", + "https://files.catbox.moe/aq3m2a.jpeg", + "https://files.catbox.moe/yfk8r7.jpeg", + "https://files.catbox.moe/34ofbz.png", + "https://files.catbox.moe/xx4lcw.mp4", + "https://files.catbox.moe/xocd6t.mp3", + }, + ), + ), +) +def test_get_links(test_url: str, expected: set[str]): + results = Catbox.get_links(test_url) + assert results == expected + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize( + ("test_url", "expected_hashes"), + ( + ( + "https://catbox.moe/c/vel5eg", + { + "014762b38e280ef3c0d000cc5f2aa386", + "85799edf12e20876f37286784460ad1b", + "c71b88c4230aa3aaad52a644fb709737", + "f40cffededd1929726d9cd265cc42c67", + "bda1f646c49607183c2450441f2ea6e8", + "21b48729bf9be7884999442b73887eed", + "0ec327259733a8276c207cc6e1b001ad", + }, + ), + ), +) +def test_download_resources(test_url: str, expected_hashes: set[str]): + mock_download = Mock() + mock_download.url = test_url + downloader = Catbox(mock_download) + results = downloader.find_resources() + assert all(isinstance(res, Resource) for res in results) + [res.download() for res in results] + hashes = {res.hash.hexdigest() for res in results} + assert hashes == set(expected_hashes) From 8d6101112b5e98b2834ea4b9f9c5f89c26bcd710 Mon Sep 17 00:00:00 2001 From: Soulsuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sat, 29 Apr 2023 14:08:08 -0400 Subject: [PATCH 40/82] Redgif coverage better coverage for thumbs subdomains and direct links to images. 
--- bdfr/site_downloaders/download_factory.py | 2 +- bdfr/site_downloaders/redgifs.py | 3 +-- tests/site_downloaders/test_redgifs.py | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index b3485617..c3b51a81 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -27,7 +27,7 @@ def pull_lever(url: str) -> type[BaseDownloader]: sanitised_url = DownloadFactory.sanitise_url(url).lower() if re.match(r"(i\.|m\.|o\.)?imgur", sanitised_url): return Imgur - elif re.match(r"(i\.|thumbs\d\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url): + elif re.match(r"(i\.|thumbs\d{1,2}\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url): return Redgifs elif re.match(r"(thumbs\.|giant\.)?gfycat\.", sanitised_url): return Gfycat diff --git a/bdfr/site_downloaders/redgifs.py b/bdfr/site_downloaders/redgifs.py index d8410a12..272fd902 100644 --- a/bdfr/site_downloaders/redgifs.py +++ b/bdfr/site_downloaders/redgifs.py @@ -33,8 +33,7 @@ def _get_id(url: str) -> str: if url.endswith("/"): url = url.removesuffix("/") redgif_id = re.match(r".*/(.*?)(?:#.*|\?.*|\..{0,})?$", url).group(1).lower() - if redgif_id.endswith("-mobile"): - redgif_id = redgif_id.removesuffix("-mobile") + redgif_id = re.sub(r"(-.*)$", "", redgif_id) except AttributeError: raise SiteDownloaderError(f"Could not extract Redgifs ID from {url}") return redgif_id diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 038e8f0a..86478f05 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -25,6 +25,7 @@ def test_auth_cache(): ("https://thumbs4.redgifs.com/DismalIgnorantDrongo.mp4", "dismalignorantdrongo"), ("https://thumbs4.redgifs.com/DismalIgnorantDrongo-mobile.mp4", "dismalignorantdrongo"), ("https://v3.redgifs.com/watch/newilliteratemeerkat#rel=user%3Atastynova", "newilliteratemeerkat"), + ("https://thumbs46.redgifs.com/BabyishCharmingAidi-medium.jpg", "babyishcharmingaidi"), ), ) def test_get_id(test_url: str, expected: str): @@ -81,6 +82,7 @@ def test_get_link(test_url: str, expected: set[str]): "44fb28f72ec9a5cca63fa4369ab4f672", }, ), + ("https://thumbs46.redgifs.com/BabyishCharmingAidi-medium.jpg", {"bf14b9f3d5b630cb5fd271661226f1af"}), ), ) def test_download_resource(test_url: str, expected_hashes: set[str]): From a05ece3013190d02f272da5159313e17dcf1fb4d Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 4 May 2023 21:50:55 -0400 Subject: [PATCH 41/82] Add option for downvoted posts --- README.md | 11 +++++++---- bdfr/__main__.py | 9 +++++---- bdfr/configuration.py | 1 + bdfr/connector.py | 7 +++++-- tests/integration_tests/test_download_integration.py | 1 + tests/test_connector.py | 1 + 6 files changed, 20 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 47284630..fb623371 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,9 @@ The following options are common between both the `archive` and `download` comma - Can be specified multiple times - Disables certain modules from being used - See [Disabling Modules](#disabling-modules) for more information and a list of module names +- `--downvoted` + - This will use a user's downvoted posts as a source of posts to scrape + - This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me` - `--filename-restriction-scheme` - Can 
be: `windows`, `linux` - Turns off the OS detection and specifies which system to use when making filenames @@ -176,7 +179,7 @@ The following options are common between both the `archive` and `download` comma - If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.** - `-S, --sort` - This is the sort type for each applicable submission source supplied to the BDFR - - This option does not apply to upvoted or saved posts when scraping from these sources + - This option does not apply to upvoted, downvoted or saved posts when scraping from these sources - The following options are available: - `controversial` - `hot` (default) @@ -200,7 +203,7 @@ The following options are common between both the `archive` and `download` comma - Subreddits can also be used to provide CSV subreddits e.g. `-m "all, python, mindustry"` - `-t, --time` - This is the time filter that will be applied to all applicable sources - - This option does not apply to upvoted or saved posts when scraping from these sources + - This option does not apply to upvoted, downvoted or saved posts when scraping from these sources - This option only applies if sorting by top or controversial. See --sort for more detail. - The following options are available: - `all` (default) @@ -309,9 +312,9 @@ The part `-L 50` is to make sure that the character limit for a single line isn' ## Authentication and Security -The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits. +The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, downvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, downvoted posts, saved posts, and private multireddits. -To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Read this and **confirm that there are no more permissions than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests permission to read your saved or upvoted submissions and identify as you. +To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. 
Read this and **confirm that there are no more permissions than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests permission to read your saved, upvoted, or downvoted submissions and identify as you. If the permissions look safe, confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on. diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 670f0a35..4e117165 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -19,10 +19,11 @@ click.argument("directory", type=str), click.option("--authenticate", is_flag=True, default=None), click.option("--config", type=str, default=None), - click.option("--disable-module", multiple=True, default=None, type=str), - click.option("--exclude-id", default=None, multiple=True), - click.option("--exclude-id-file", default=None, multiple=True), - click.option("--file-scheme", default=None, type=str), + click.option("--disable-module", type=str, multiple=True, default=None), + click.option("--downvoted", is_flag=True, default=None), + click.option("--exclude-id", type=str, multiple=True, default=None), + click.option("--exclude-id-file", type=str, multiple=True, default=None), + click.option("--file-scheme", type=str, default=None), click.option("--filename-restriction-scheme", type=click.Choice(("linux", "windows")), default=None), click.option("--folder-scheme", default=None, type=str), click.option("--ignore-user", type=str, multiple=True, default=None), diff --git a/bdfr/configuration.py b/bdfr/configuration.py index b7e7a251..348e2383 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -19,6 +19,7 @@ def __init__(self): self.opts: Optional[str] = None self.directory: str = "." self.disable_module: list[str] = [] + self.downvoted: bool = False self.exclude_id = [] self.exclude_id_file = [] self.file_scheme: str = "{REDDITOR}_{TITLE}_{POSTID}" diff --git a/bdfr/connector.py b/bdfr/connector.py index b31b59ea..c04bedc2 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -357,7 +357,7 @@ def create_filtered_listing_generator(self, reddit_source) -> Iterator: return sort_function(reddit_source, limit=self.args.limit) def get_user_data(self) -> list[Iterator]: - if any([self.args.submitted, self.args.upvoted, self.args.saved]): + if any([self.args.downvoted, self.args.saved, self.args.submitted, self.args.upvoted]): if not self.args.user: logger.warning("At least one user must be supplied to download user data") return [] @@ -376,7 +376,7 @@ def get_user_data(self) -> list[Iterator]: self.reddit_instance.redditor(user).submissions, ) ) - if not self.authenticated and any((self.args.upvoted, self.args.saved)): + if not self.authenticated and any((self.args.downvoted, self.args.saved, self.args.upvoted)): logger.warning("Accessing user lists requires authentication") else: if self.args.upvoted: @@ -385,6 +385,9 @@ def get_user_data(self) -> list[Iterator]: if self.args.saved: logger.debug(f"Retrieving saved posts of user {user}") generators.append(self.reddit_instance.redditor(user).saved(limit=self.args.limit)) + if self.args.downvoted: + logger.debug(f"Retrieving downvoted posts of user {user}") + generators.append(self.reddit_instance.redditor(user).downvoted(limit=self.args.limit)) except prawcore.PrawcoreException as e: logger.error(f"User {user} failed to be retrieved due to a PRAW exception: {e}") logger.debug("Waiting 60 seconds to continue") diff --git a/tests/integration_tests/test_download_integration.py 
b/tests/integration_tests/test_download_integration.py index 711f9c08..514b290b 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -159,6 +159,7 @@ def test_cli_download_multireddit_nonexistent(test_args: list[str], tmp_path: Pa "test_args", ( ["--user", "djnish", "--submitted", "--user", "FriesWithThat", "-L", 10], + ["--user", "me", "--downvoted", "--authenticate", "-L", 10], ["--user", "me", "--upvoted", "--authenticate", "-L", 10], ["--user", "me", "--saved", "--authenticate", "-L", 10], ["--user", "me", "--submitted", "--authenticate", "-L", 10], diff --git a/tests/test_connector.py b/tests/test_connector.py index 832d2fe6..b4a7549d 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -362,6 +362,7 @@ def test_get_user_submissions(test_user: str, limit: int, downloader_mock: Magic @pytest.mark.parametrize( "test_flag", ( + "downvoted", "upvoted", "saved", ), From 13980fb6ac0a77a86c44a9ce03b0c4d7f3fe59e1 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 17 May 2023 10:54:52 +1000 Subject: [PATCH 42/82] Remove failing test case --- tests/site_downloaders/test_imgur.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 8881c319..cf5a996c 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -48,7 +48,6 @@ ("https://imgur.com/a/1qzfWtY/gifv", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), ("https://imgur.com/a/1qzfWtY/mp4", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), ("https://imgur.com/a/1qzfWtY/spqr", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), - ("https://i.imgur.com/expO7Rc.gifv", ("e309f98158fc98072eb2ae68f947f421",)), ("https://i.imgur.com/a/aqpiMuL.gif", ("5b2a9a5218bf43dc26ba41389410c981",)), ), ) From f920da569e73e0be2c232caba5f7e05e16fe414b Mon Sep 17 00:00:00 2001 From: Armin Samii Date: Tue, 23 May 2023 08:01:38 -0400 Subject: [PATCH 43/82] allow skipping of comment download --- README.md | 3 +++ bdfr/__main__.py | 1 + bdfr/archive_entry/submission_archive_entry.py | 8 ++++++-- bdfr/archiver.py | 5 ++--- bdfr/configuration.py | 1 + 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5a927bf1..f1a33af1 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,9 @@ The following options are for the `archive` command specifically. - `--comment-context` - This option will, instead of downloading an individual comment, download the submission that comment is a part of - May result in a longer run time as it retrieves much more data +- `--skip-comments` + - Skip downloading all comments. This will result in a much shorter runtime. 
+ - Not compatible with --comment-context ### Cloner Options diff --git a/bdfr/__main__.py b/bdfr/__main__.py index dadba517..6f0f587b 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -66,6 +66,7 @@ click.option("--all-comments", is_flag=True, default=None), click.option("--comment-context", is_flag=True, default=None), click.option("-f", "--format", type=click.Choice(("xml", "json", "yaml")), default=None), + click.option("--skip-comments", is_flag=True, default=None), ] diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index 38f1d347..8ff45952 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -11,11 +11,15 @@ class SubmissionArchiveEntry(BaseArchiveEntry): - def __init__(self, submission: praw.models.Submission): + def __init__(self, submission: praw.models.Submission, include_comments: bool): super(SubmissionArchiveEntry, self).__init__(submission) + self._include_comments = include_comments def compile(self) -> dict: - comments = self._get_comments() + if self._include_comments: + comments = self._get_comments() + else: + comments = [] self._get_post_details() out = self.post_details out["comments"] = comments diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 52b4649a..03754ae5 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -74,10 +74,9 @@ def get_user_data(self) -> list[Iterator]: results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit)) return results - @staticmethod - def _pull_lever_entry_factory(praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry: + def _pull_lever_entry_factory(self, praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry: if isinstance(praw_item, praw.models.Submission): - return SubmissionArchiveEntry(praw_item) + return SubmissionArchiveEntry(praw_item, not self.args.skip_comments) elif isinstance(praw_item, praw.models.Comment): return CommentArchiveEntry(praw_item) else: diff --git a/bdfr/configuration.py b/bdfr/configuration.py index 05fc27e8..0e6bb325 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -58,6 +58,7 @@ def __init__(self): self.all_comments = False self.format = "json" self.comment_context: bool = False + self.skip_comments = False def process_click_arguments(self, context: click.Context): if context.params.get("opts") is not None: From ceecb4d6acd410f21928998ce28b43b5c197c346 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Tue, 16 May 2023 15:38:53 -0400 Subject: [PATCH 44/82] Add imgchest downloader Adds downloader for imgchest albums --- bdfr/site_downloaders/download_factory.py | 3 + bdfr/site_downloaders/imgchest.py | 35 ++++++++++ tests/site_downloaders/test_imgchest.py | 83 +++++++++++++++++++++++ 3 files changed, 121 insertions(+) create mode 100644 bdfr/site_downloaders/imgchest.py create mode 100644 tests/site_downloaders/test_imgchest.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index c3b51a81..867d7b2e 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -12,6 +12,7 @@ from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat +from bdfr.site_downloaders.imgchest import Imgchest from bdfr.site_downloaders.imgur 
import Imgur from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs @@ -45,6 +46,8 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Gallery elif re.match(r"patreon\.com.*", sanitised_url): return Gallery + elif re.match(r"imgchest\.com/p/", sanitised_url): + return Imgchest elif re.match(r"reddit\.com/r/", sanitised_url): return SelfPost elif re.match(r"(m\.)?youtu\.?be", sanitised_url): diff --git a/bdfr/site_downloaders/imgchest.py b/bdfr/site_downloaders/imgchest.py new file mode 100644 index 00000000..1a1dbd61 --- /dev/null +++ b/bdfr/site_downloaders/imgchest.py @@ -0,0 +1,35 @@ +import logging +from typing import Optional + +import bs4 +from praw.models import Submission + +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + +logger = logging.getLogger(__name__) + + +class Imgchest(BaseDownloader): + def __init__(self, post: Submission) -> None: + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + links = self._get_links(self.post.url) + if not links: + raise SiteDownloaderError("Imgchest parser could not find any links") + links = [Resource(self.post, link, Resource.retry_download(link)) for link in links] + return links + + @staticmethod + def _get_links(url: str) -> set[str]: + page = Imgchest.retrieve_url(url) + soup = bs4.BeautifulSoup(page.text, "html.parser") + album_div = soup.find("div", attrs={"id": "post-images"}) + images = album_div.find_all("img") + out = [im.get("src") for im in images] + videos = album_div.find_all("source") + out.extend([vid.get("src") for vid in videos]) + return set(out) diff --git a/tests/site_downloaders/test_imgchest.py b/tests/site_downloaders/test_imgchest.py new file mode 100644 index 00000000..64997f8f --- /dev/null +++ b/tests/site_downloaders/test_imgchest.py @@ -0,0 +1,83 @@ +from unittest.mock import Mock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.imgchest import Imgchest + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_url", "expected"), + ( + ( + "https://www.imgchest.com/p/ro24aogylj5", # Basic image album + { + "https://cdn.imgchest.com/files/jd7ogcgl5y9.jpg", + "https://cdn.imgchest.com/files/rj7kzcdv27m.jpg", + "https://cdn.imgchest.com/files/vmy2pc2pr7j.jpg", + "https://cdn.imgchest.com/files/xl7lxce967o.jpg", + }, + ), + ( + "https://www.imgchest.com/p/o24ap5wd4lj", # Image and video album + { + "https://cdn.imgchest.com/files/k46ac86kq7z.jpeg", + "https://cdn.imgchest.com/files/pyvdczlvayk.jpeg", + "https://cdn.imgchest.com/files/6yxkcvlrn7w.jpeg", + "https://cdn.imgchest.com/files/b49zce5wkyw.jpeg", + "https://cdn.imgchest.com/files/l4necb3kw4m.jpeg", + "https://cdn.imgchest.com/files/p7bwc3rx37n.mp4", + "https://cdn.imgchest.com/files/w7pjcbe587p.mp4", + "https://cdn.imgchest.com/files/d7ogcr95jy9.mp4", + "https://cdn.imgchest.com/files/j7kzc9r557m.mp4", + "https://cdn.imgchest.com/files/my2pc3wzl7j.mp4", + }, + ), + ), +) +def test_get_links(test_url: str, expected: set[str]): + results = Imgchest._get_links(test_url) + assert results == expected + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize( + ("test_url", "expected_hashes"), + ( + ( + "https://www.imgchest.com/p/ro24aogylj5", # Basic image album + { + "91f1a5919b32af6cbf5c24528e83871c", + 
"c4969ac347fdcefbb6b2ec01c0be02ae", + "a9db23217974d8b78c84b463224f130a", + "6a0d0e28f02c2cdccff80f9973efbad3", + }, + ), + ( + "https://www.imgchest.com/p/o24ap5wd4lj", # Image and video album + { + "a4ea3f676c8a1cbca8e2faf70a031e1e", + "59db5f35f5969d638c4036a3a249b1e1", + "73ee75fe341022cd643431a4fb78be3d", + "6fe6f1239dd39f948b3abb583c310c7d", + "8e9b652c62b906ba54607c7fd8ce6d63", + "108b167b04830ce0a59c27415bb5ef86", + "05a063fe87fb010ca782c268d0bf90c5", + "5ef705919760684d54e082430f32551a", + "7ff437036cac57e04aaabcfd604ad2c8", + "d2e3eb303f3a605b2a8587f914b78c34", + }, + ), + ), +) +def test_download_resources(test_url: str, expected_hashes: set[str]): + mock_download = Mock() + mock_download.url = test_url + downloader = Imgchest(mock_download) + results = downloader.find_resources() + assert all(isinstance(res, Resource) for res in results) + [res.download() for res in results] + hashes = {res.hash.hexdigest() for res in results} + assert hashes == set(expected_hashes) From b081f5cc377495991ef31247c8ec3808afb7ee4a Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 29 Apr 2023 21:16:37 -0400 Subject: [PATCH 45/82] Refactor Imgur Refactor Imgur to be able to test getting ID and mark downloads as slow as they are currently rate limited through Github. Also removed hash tests for nsfw links as they are being removed by Imgur. --- bdfr/site_downloaders/imgur.py | 22 ++++++------ tests/site_downloaders/test_imgur.py | 50 +++++++++++++++++++--------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/bdfr/site_downloaders/imgur.py b/bdfr/site_downloaders/imgur.py index cf15c3c0..96a43326 100644 --- a/bdfr/site_downloaders/imgur.py +++ b/bdfr/site_downloaders/imgur.py @@ -35,18 +35,20 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l return out @staticmethod - def _get_data(link: str) -> dict: + def _get_id(link: str) -> str: try: - if link.endswith("/"): - link = link.removesuffix("/") - if re.search(r".*/(.*?)(gallery/|a/)", link): - imgur_id = re.match(r".*/(?:gallery/|a/)(.*?)(?:/.*|\..{3,4})?$", link).group(1) - link = f"https://api.imgur.com/3/album/{imgur_id}" - else: - imgur_id = re.match(r".*/(.*?)(?:_d)?(?:\..{0,})?$", link).group(1) - link = f"https://api.imgur.com/3/image/{imgur_id}" + imgur_id = re.search(r"imgur\.com/(?:a/|gallery/)?([a-zA-Z0-9]+)", link).group(1) except AttributeError: raise SiteDownloaderError(f"Could not extract Imgur ID from {link}") + return imgur_id + + @staticmethod + def _get_data(link: str) -> dict: + imgur_id = Imgur._get_id(link) + if re.search(r"/(gallery|a)/", link): + api = f"https://api.imgur.com/3/album/{imgur_id}" + else: + api = f"https://api.imgur.com/3/image/{imgur_id}" headers = { "referer": "https://imgur.com/", @@ -54,7 +56,7 @@ def _get_data(link: str) -> dict: "content-type": "application/json", "Authorization": "Client-ID 546c25a59c58ad7", } - res = Imgur.retrieve_url(link, headers=headers) + res = Imgur.retrieve_url(api, headers=headers) try: image_dict = json.loads(res.text) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 8881c319..9b08d50e 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -8,7 +8,41 @@ from bdfr.site_downloaders.imgur import Imgur +@pytest.mark.parametrize( + ("test_url", "expected"), + ( + ("https://imgur.com/a/xWZsDDP", "xWZsDDP"), + ("https://imgur.com/gallery/IjJJdlC", "IjJJdlC"), + ("https://imgur.com/gallery/IjJJdlC/", 
"IjJJdlC"), + ("https://imgur.com/a/dcc84Gt", "dcc84Gt"), + ("https://imgur.com/a/eemHCCK", "eemHCCK"), + ("https://o.imgur.com/jZw9gq2.jpg", "jZw9gq2"), + ("https://i.imgur.com/lFJai6i.gifv", "lFJai6i"), + ("https://i.imgur.com/ywSyILa.gifv?", "ywSyILa"), + ("https://imgur.com/ubYwpbk.GIFV", "ubYwpbk"), + ("https://i.imgur.com/j1CNCZY.gifv", "j1CNCZY"), + ("https://i.imgur.com/uTvtQsw.gifv", "uTvtQsw"), + ("https://i.imgur.com/OGeVuAe.giff", "OGeVuAe"), + ("https://i.imgur.com/OGeVuAe.gift", "OGeVuAe"), + ("https://i.imgur.com/3SKrQfK.jpg?1", "3SKrQfK"), + ("https://i.imgur.com/cbivYRW.jpg?3", "cbivYRW"), + ("http://i.imgur.com/s9uXxlq.jpg?5.jpg", "s9uXxlq"), + ("http://i.imgur.com/s9uXxlqb.jpg", "s9uXxlqb"), + ("https://i.imgur.com/2TtN68l_d.webp", "2TtN68l"), + ("https://imgur.com/a/1qzfWtY/gifv", "1qzfWtY"), + ("https://imgur.com/a/1qzfWtY/mp4", "1qzfWtY"), + ("https://imgur.com/a/1qzfWtY/spqr", "1qzfWtY"), + ("https://i.imgur.com/expO7Rc.gifv", "expO7Rc"), + ("https://i.imgur.com/a/aqpiMuL.gif", "aqpiMuL"), + ), +) +def test_get_id(test_url: str, expected: str): + result = Imgur._get_id(test_url) + assert result == expected + + @pytest.mark.online +@pytest.mark.slow @pytest.mark.parametrize( ("test_url", "expected_hashes"), ( @@ -24,32 +58,16 @@ "029c475ce01b58fdf1269d8771d33913", ), ), - ( - "https://imgur.com/a/eemHCCK", - ( - "9cb757fd8f055e7ef7aa88addc9d9fa5", - "b6cb6c918e2544e96fb7c07d828774b5", - "fb6c913d721c0bbb96aa65d7f560d385", - ), - ), - ("https://o.imgur.com/jZw9gq2.jpg", ("6d6ea9aa1d98827a05425338afe675bc",)), - ("https://i.imgur.com/lFJai6i.gifv", ("01a6e79a30bec0e644e5da12365d5071",)), - ("https://i.imgur.com/ywSyILa.gifv?", ("56d4afc32d2966017c38d98568709b45",)), - ("https://imgur.com/ubYwpbk.GIFV", ("d4a774aac1667783f9ed3a1bd02fac0c",)), ("https://i.imgur.com/j1CNCZY.gifv", ("ed63d7062bc32edaeea8b53f876a307c",)), ("https://i.imgur.com/uTvtQsw.gifv", ("46c86533aa60fc0e09f2a758513e3ac2",)), ("https://i.imgur.com/OGeVuAe.giff", ("77389679084d381336f168538793f218",)), ("https://i.imgur.com/OGeVuAe.gift", ("77389679084d381336f168538793f218",)), - ("https://i.imgur.com/3SKrQfK.jpg?1", ("aa299e181b268578979cad176d1bd1d0",)), ("https://i.imgur.com/cbivYRW.jpg?3", ("7ec6ceef5380cb163a1d498c359c51fd",)), ("http://i.imgur.com/s9uXxlq.jpg?5.jpg", ("338de3c23ee21af056b3a7c154e2478f",)), ("http://i.imgur.com/s9uXxlqb.jpg", ("338de3c23ee21af056b3a7c154e2478f",)), - ("https://i.imgur.com/2TtN68l_d.webp", ("6569ab9ad9fa68d93f6b408f112dd741",)), ("https://imgur.com/a/1qzfWtY/gifv", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), ("https://imgur.com/a/1qzfWtY/mp4", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), ("https://imgur.com/a/1qzfWtY/spqr", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), - ("https://i.imgur.com/expO7Rc.gifv", ("e309f98158fc98072eb2ae68f947f421",)), - ("https://i.imgur.com/a/aqpiMuL.gif", ("5b2a9a5218bf43dc26ba41389410c981",)), ), ) def test_find_resources(test_url: str, expected_hashes: list[str]): From db08bc644c9cc40ee2ae74169b7dbcf5fb35df74 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 25 May 2023 18:13:36 -0400 Subject: [PATCH 46/82] Update tests Comment tests and remove effective duplicates. 
--- tests/site_downloaders/test_imgur.py | 76 ++++++++++++++-------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/tests/site_downloaders/test_imgur.py b/tests/site_downloaders/test_imgur.py index 9b08d50e..bf6535af 100644 --- a/tests/site_downloaders/test_imgur.py +++ b/tests/site_downloaders/test_imgur.py @@ -11,29 +11,22 @@ @pytest.mark.parametrize( ("test_url", "expected"), ( - ("https://imgur.com/a/xWZsDDP", "xWZsDDP"), - ("https://imgur.com/gallery/IjJJdlC", "IjJJdlC"), - ("https://imgur.com/gallery/IjJJdlC/", "IjJJdlC"), - ("https://imgur.com/a/dcc84Gt", "dcc84Gt"), - ("https://imgur.com/a/eemHCCK", "eemHCCK"), - ("https://o.imgur.com/jZw9gq2.jpg", "jZw9gq2"), - ("https://i.imgur.com/lFJai6i.gifv", "lFJai6i"), - ("https://i.imgur.com/ywSyILa.gifv?", "ywSyILa"), - ("https://imgur.com/ubYwpbk.GIFV", "ubYwpbk"), - ("https://i.imgur.com/j1CNCZY.gifv", "j1CNCZY"), - ("https://i.imgur.com/uTvtQsw.gifv", "uTvtQsw"), - ("https://i.imgur.com/OGeVuAe.giff", "OGeVuAe"), - ("https://i.imgur.com/OGeVuAe.gift", "OGeVuAe"), - ("https://i.imgur.com/3SKrQfK.jpg?1", "3SKrQfK"), - ("https://i.imgur.com/cbivYRW.jpg?3", "cbivYRW"), - ("http://i.imgur.com/s9uXxlq.jpg?5.jpg", "s9uXxlq"), - ("http://i.imgur.com/s9uXxlqb.jpg", "s9uXxlqb"), - ("https://i.imgur.com/2TtN68l_d.webp", "2TtN68l"), - ("https://imgur.com/a/1qzfWtY/gifv", "1qzfWtY"), - ("https://imgur.com/a/1qzfWtY/mp4", "1qzfWtY"), - ("https://imgur.com/a/1qzfWtY/spqr", "1qzfWtY"), - ("https://i.imgur.com/expO7Rc.gifv", "expO7Rc"), - ("https://i.imgur.com/a/aqpiMuL.gif", "aqpiMuL"), + ("https://imgur.com/a/xWZsDDP", "xWZsDDP"), # Gallery, /a/ + ("https://imgur.com/gallery/IjJJdlC", "IjJJdlC"), # Gallery, /gallery/ + ("https://imgur.com/gallery/IjJJdlC/", "IjJJdlC"), # Gallery, trailing / + ("https://o.imgur.com/jZw9gq2.jpg", "jZw9gq2"), # Direct link, jpg, incorrect subdomain + ("https://i.imgur.com/lFJai6i.gifv", "lFJai6i"), # Direct link, gifv + ("https://i.imgur.com/ywSyILa.gifv?", "ywSyILa"), # Direct link, gifv, trailing ? 
+ ("https://imgur.com/ubYwpbk.GIFV", "ubYwpbk"), # No subdomain, uppercase gifv + ("https://i.imgur.com/OGeVuAe.giff", "OGeVuAe"), # Direct link, incorrect extension + ("https://i.imgur.com/OGeVuAe.gift", "OGeVuAe"), # Direct link, incorrect extension + ("https://i.imgur.com/3SKrQfK.jpg?1", "3SKrQfK"), # Direct link, trainling ?1 + ("https://i.imgur.com/cbivYRW.jpg?3", "cbivYRW"), # Direct link, trailing ?3 + ("http://i.imgur.com/s9uXxlq.jpg?5.jpg", "s9uXxlq"), # Direct link, trailing ?5.jpg, http + ("http://i.imgur.com/s9uXxlqb.jpg", "s9uXxlqb"), # Direct link, jpg, http + ("https://i.imgur.com/2TtN68l_d.webp", "2TtN68l"), # Direct link, webp, _d thumbnail + ("https://imgur.com/a/1qzfWtY/gifv", "1qzfWtY"), # Gallery, trailing filetype + ("https://imgur.com/a/1qzfWtY/spqr", "1qzfWtY"), # Gallery, trailing non filetype ), ) def test_get_id(test_url: str, expected: str): @@ -46,11 +39,10 @@ def test_get_id(test_url: str, expected: str): @pytest.mark.parametrize( ("test_url", "expected_hashes"), ( - ("https://imgur.com/a/xWZsDDP", ("f551d6e6b0fef2ce909767338612e31b",)), - ("https://imgur.com/gallery/IjJJdlC", ("740b006cf9ec9d6f734b6e8f5130bdab",)), - ("https://imgur.com/gallery/IjJJdlC/", ("740b006cf9ec9d6f734b6e8f5130bdab",)), + ("https://imgur.com/a/xWZsDDP", ("f551d6e6b0fef2ce909767338612e31b",)), # Single image gallery + ("https://imgur.com/gallery/IjJJdlC", ("740b006cf9ec9d6f734b6e8f5130bdab",)), # Single video gallery ( - "https://imgur.com/a/dcc84Gt", + "https://imgur.com/a/dcc84Gt", # Multiple image gallery ( "cf1158e1de5c3c8993461383b96610cf", "28d6b791a2daef8aa363bf5a3198535d", @@ -58,16 +50,26 @@ def test_get_id(test_url: str, expected: str): "029c475ce01b58fdf1269d8771d33913", ), ), - ("https://i.imgur.com/j1CNCZY.gifv", ("ed63d7062bc32edaeea8b53f876a307c",)), - ("https://i.imgur.com/uTvtQsw.gifv", ("46c86533aa60fc0e09f2a758513e3ac2",)), - ("https://i.imgur.com/OGeVuAe.giff", ("77389679084d381336f168538793f218",)), - ("https://i.imgur.com/OGeVuAe.gift", ("77389679084d381336f168538793f218",)), - ("https://i.imgur.com/cbivYRW.jpg?3", ("7ec6ceef5380cb163a1d498c359c51fd",)), - ("http://i.imgur.com/s9uXxlq.jpg?5.jpg", ("338de3c23ee21af056b3a7c154e2478f",)), - ("http://i.imgur.com/s9uXxlqb.jpg", ("338de3c23ee21af056b3a7c154e2478f",)), - ("https://imgur.com/a/1qzfWtY/gifv", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), - ("https://imgur.com/a/1qzfWtY/mp4", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), - ("https://imgur.com/a/1qzfWtY/spqr", ("65fbc7ba5c3ed0e3af47c4feef4d3735",)), + ("https://i.imgur.com/j1CNCZY.gifv", ("ed63d7062bc32edaeea8b53f876a307c",)), # Direct video link + ("https://i.imgur.com/uTvtQsw.gifv", ("46c86533aa60fc0e09f2a758513e3ac2",)), # Direct video link + ( + "https://i.imgur.com/OGeVuAe.giff", # Direct video link, incorrect extension + ("77389679084d381336f168538793f218",), + ), + ("https://i.imgur.com/cbivYRW.jpg?3", ("7ec6ceef5380cb163a1d498c359c51fd",)), # Direct image link, trailing ?3 + ( + "http://i.imgur.com/s9uXxlq.jpg?5.jpg", # Direct image link, trailing ?5.jpg + ("338de3c23ee21af056b3a7c154e2478f",), + ), + ("http://i.imgur.com/s9uXxlqb.jpg", ("338de3c23ee21af056b3a7c154e2478f",)), # Direct image link + ( + "https://imgur.com/a/1qzfWtY/mp4", # Single video gallery, web filetype request + ("65fbc7ba5c3ed0e3af47c4feef4d3735",), + ), + ( + "https://imgur.com/a/1qzfWtY/spqr", # Single video gallery, web filetype invalid + ("65fbc7ba5c3ed0e3af47c4feef4d3735",), + ), ), ) def test_find_resources(test_url: str, expected_hashes: list[str]): From 
9c0017c11b4c9577afdd9b8d8ea0d03dad30ce77 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 25 May 2023 13:27:11 -0400 Subject: [PATCH 47/82] Fix Erome test Replaces failing erome tests. --- tests/site_downloaders/test_erome.py | 30 +++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/site_downloaders/test_erome.py b/tests/site_downloaders/test_erome.py index 7dd31813..911d2641 100644 --- a/tests/site_downloaders/test_erome.py +++ b/tests/site_downloaders/test_erome.py @@ -12,17 +12,23 @@ @pytest.mark.parametrize( ("test_url", "expected_urls"), ( - ("https://www.erome.com/a/vqtPuLXh", (r"https://[a-z]\d+.erome.com/\d{3}/vqtPuLXh/KH2qBT99_480p.mp4",)), ( - "https://www.erome.com/a/ORhX0FZz", + "https://www.erome.com/a/vqtPuLXh", # Video + (r"https://[a-z]\d+.erome.com/\d{3}/vqtPuLXh/KH2qBT99_480p.mp4",), + ), + ( + "https://www.erome.com/a/9E50Xkb6", # Image album ( - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/9IYQocM9_480p.mp4", - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/9eEDc8xm_480p.mp4", - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/EvApC7Rp_480p.mp4", - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/LruobtMs_480p.mp4", - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/TJNmSUU5_480p.mp4", - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/X11Skh6Z_480p.mp4", - r"https://[a-z]\d+.erome.com/\d{3}/ORhX0FZz/bjlTkpn7_480p.mp4", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/hUpc1d21.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/3zZF7uv4.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/h6C03hNq.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/AHQuZh9j.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/Ram0NmDU.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/dY82guy1.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/3x8bp9lF.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/lxyFSUMQ.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/vPIb29UR.jpeg", + r"https://[a-z]\d+.erome.com/\d{4}/9E50Xkb6/w1BJtyh5.jpeg", ), ), ), @@ -37,9 +43,9 @@ def test_get_link(test_url: str, expected_urls: tuple[str]): @pytest.mark.parametrize( ("test_url", "expected_hashes_len"), ( - ("https://www.erome.com/a/vqtPuLXh", 1), - ("https://www.erome.com/a/4tP3KI6F", 1), - ("https://www.erome.com/a/WNyK674a", 41), + ("https://www.erome.com/a/vqtPuLXh", 1), # Video + ("https://www.erome.com/a/4tP3KI6F", 1), # Video + ("https://www.erome.com/a/9E50Xkb6", 10), # Image album ), ) def test_download_resource(test_url: str, expected_hashes_len: int): From 288c42f3929cd8cde32d759235e9ae92bbc9687d Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sun, 4 Jun 2023 17:46:51 -0400 Subject: [PATCH 48/82] RUF001 --- pyproject.toml | 1 + tests/test_file_name_formatter.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c4d5f08f..183f9c04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,7 @@ markers = [ [tool.ruff] exclude = ["scripts/tests"] +external = ["FURB123"] flake8-annotations = {"allow-star-arg-any" = true, "suppress-dummy-args" = true} flake8-pytest-style = {"parametrize-values-type" = "tuple", "mark-parentheses" = false} format = "grouped" diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index a4d966bb..7afc2b8a 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -427,8 +427,8 @@ def test_multilevel_folder_scheme( ("test", 
"test"), ("😍", "😍"), ("test😍", "test😍"), - ("test😍 ’", "test😍 ’"), - ("test😍 \\u2019", "test😍 ’"), + ("test😍 ’", "test😍 ’"), # noqa: RUF001 + ("test😍 \\u2019", "test😍 ’"), # noqa: RUF001 ("Using that real good [1\\4]", "Using that real good [1\\4]"), ), ) @@ -442,8 +442,8 @@ def test_preserve_emojis(test_name_string: str, expected: str, submission: Magic @pytest.mark.parametrize( ("test_string", "expected"), ( - ("test \\u2019", "test ’"), - ("My cat\\u2019s paws are so cute", "My cat’s paws are so cute"), + ("test \\u2019", "test ’"), # noqa: RUF001 + ("My cat\\u2019s paws are so cute", "My cat’s paws are so cute"), # noqa: RUF001 ), ) def test_convert_unicode_escapes(test_string: str, expected: str): From 4fb48752adf7d9b68ab8786575ca814a12232085 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sun, 4 Jun 2023 17:47:15 -0400 Subject: [PATCH 49/82] RUF005 --- tests/integration_tests/test_archive_integration.py | 3 ++- tests/integration_tests/test_clone_integration.py | 3 ++- tests/integration_tests/test_download_integration.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index 329b9a17..aa98b2c4 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -28,7 +28,8 @@ def create_basic_args_for_archive_runner(test_args: list[str], run_path: Path): str(Path(run_path, "test_config.cfg")), "--log", str(Path(run_path, "test_log.txt")), - ] + test_args + *test_args, + ] return out diff --git a/tests/integration_tests/test_clone_integration.py b/tests/integration_tests/test_clone_integration.py index bc250e21..c5aa9bba 100644 --- a/tests/integration_tests/test_clone_integration.py +++ b/tests/integration_tests/test_clone_integration.py @@ -27,7 +27,8 @@ def create_basic_args_for_cloner_runner(test_args: list[str], tmp_path: Path): str(Path(tmp_path, "test_config.cfg")), "--log", str(Path(tmp_path, "test_log.txt")), - ] + test_args + *test_args, + ] return out diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index e9dc5e28..188d5528 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -28,7 +28,8 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): str(Path(run_path, "test_config.cfg")), "--log", str(Path(run_path, "test_log.txt")), - ] + test_args + *test_args, + ] return out From a8c757072b914519050560fd2efe0e10daee1284 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:42:53 -0400 Subject: [PATCH 50/82] B607/603 coverage --- bdfr/completion.py | 12 +++++++++--- bdfr/file_name_formatter.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bdfr/completion.py b/bdfr/completion.py index 3d8ccb97..9bb2a82a 100644 --- a/bdfr/completion.py +++ b/bdfr/completion.py @@ -23,7 +23,9 @@ def install(self) -> None: for point in self.entry_points: self.env[f"_{point.upper().replace('-', '_')}_COMPLETE"] = "bash_source" with Path(comp_dir + point).open(mode="w") as file: - file.write(subprocess.run([point], env=self.env, capture_output=True, text=True).stdout) + file.write( + subprocess.run([point], env=self.env, capture_output=True, text=True).stdout, # noqa: S603 + ) print(f"Bash completion for {point} written to 
{comp_dir}{point}") if self.shell in ("all", "fish"): comp_dir = self.share_dir + "/fish/vendor_completions.d/" @@ -33,7 +35,9 @@ def install(self) -> None: for point in self.entry_points: self.env[f"_{point.upper().replace('-', '_')}_COMPLETE"] = "fish_source" with Path(comp_dir + point + ".fish").open(mode="w") as file: - file.write(subprocess.run([point], env=self.env, capture_output=True, text=True).stdout) + file.write( + subprocess.run([point], env=self.env, capture_output=True, text=True).stdout, # noqa: S603 + ) print(f"Fish completion for {point} written to {comp_dir}{point}.fish") if self.shell in ("all", "zsh"): comp_dir = self.share_dir + "/zsh/site-functions/" @@ -43,7 +47,9 @@ def install(self) -> None: for point in self.entry_points: self.env[f"_{point.upper().replace('-', '_')}_COMPLETE"] = "zsh_source" with Path(comp_dir + "_" + point).open(mode="w") as file: - file.write(subprocess.run([point], env=self.env, capture_output=True, text=True).stdout) + file.write( + subprocess.run([point], env=self.env, capture_output=True, text=True).stdout, # noqa: S603 + ) print(f"Zsh completion for {point} written to {comp_dir}_{point}") def uninstall(self) -> None: diff --git a/bdfr/file_name_formatter.py b/bdfr/file_name_formatter.py index 10c54b30..5ddedeca 100644 --- a/bdfr/file_name_formatter.py +++ b/bdfr/file_name_formatter.py @@ -171,7 +171,7 @@ def limit_file_name_length(self, filename: str, ending: str, root: Path) -> Path @staticmethod def find_max_path_length() -> int: try: - return int(subprocess.check_output(["getconf", "PATH_MAX", "/"])) + return int(subprocess.check_output(["getconf", "PATH_MAX", "/"])) # noqa: S603, S607 except (ValueError, subprocess.CalledProcessError, OSError): if platform.system() == "Windows": return FileNameFormatter.WINDOWS_MAX_PATH_LENGTH From 62fe08db130b9bcdb060b14bc9d04b21acb94572 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Mon, 5 Jun 2023 20:44:46 -0400 Subject: [PATCH 51/82] B311 --- bdfr/oauth2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/oauth2.py b/bdfr/oauth2.py index af31f828..bf891f4d 100644 --- a/bdfr/oauth2.py +++ b/bdfr/oauth2.py @@ -50,7 +50,7 @@ def retrieve_new_token(self) -> str: client_id=self.client_id, client_secret=self.client_secret, ) - state = str(random.randint(0, 65000)) + state = str(random.randint(0, 65000)) # noqa: S311 url = reddit.auth.url(self.scopes, state, "permanent") logger.warning("Authentication action required before the program can proceed") logger.warning(f"Authenticate at {url}") From 1d95778b037308092f16ce0f814b3912e69df198 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Mon, 5 Jun 2023 19:59:37 -0400 Subject: [PATCH 52/82] Connector test edge case Covers edge case of not having the exact amount as the limit, now allows between one and the limit. 
--- tests/test_connector.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/test_connector.py b/tests/test_connector.py index b4a7549d..55ac6bfa 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -3,6 +3,7 @@ from collections.abc import Iterator from datetime import datetime, timedelta from pathlib import Path +from typing import Union from unittest.mock import MagicMock import praw @@ -37,21 +38,23 @@ def downloader_mock(args: Configuration): return downloader_mock -def assert_all_results_are_submissions(result_limit: int, results: list[Iterator]) -> list: +def assert_all_results_are_submissions(result_limit: Union[int, None], results: list[Iterator]) -> list: results = [sub for res in results for sub in res] assert all([isinstance(res, praw.models.Submission) for res in results]) assert not any([isinstance(m, MagicMock) for m in results]) if result_limit is not None: - assert len(results) == result_limit + assert len(results) > 0 + assert len(results) <= result_limit return results -def assert_all_results_are_submissions_or_comments(result_limit: int, results: list[Iterator]) -> list: +def assert_all_results_are_submissions_or_comments(result_limit: Union[int, None], results: list[Iterator]) -> list: results = [sub for res in results for sub in res] assert all([isinstance(res, (praw.models.Submission, praw.models.Comment)) for res in results]) assert not any([isinstance(m, MagicMock) for m in results]) if result_limit is not None: - assert len(results) == result_limit + assert len(results) > 0 + assert len(results) <= result_limit return results From 42264f0872735cd75756171d5c1f8b65166b3ddb Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 8 Jun 2023 23:08:37 -0400 Subject: [PATCH 53/82] Add Flickr --- bdfr/site_downloaders/download_factory.py | 3 + bdfr/site_downloaders/flickr.py | 111 ++++++++++++++++++++++ tests/site_downloaders/test_flickr.py | 91 ++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 bdfr/site_downloaders/flickr.py create mode 100644 tests/site_downloaders/test_flickr.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 867d7b2e..e3d36506 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -10,6 +10,7 @@ from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.erome import Erome from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback +from bdfr.site_downloaders.flickr import Flickr from bdfr.site_downloaders.gallery import Gallery from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgchest import Imgchest @@ -42,6 +43,8 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Catbox elif re.match(r"delayforreddit\.com", sanitised_url): return DelayForReddit + elif re.match(r"flickr\.com", sanitised_url) or re.match(r"flic\.kr", sanitised_url): + return Flickr elif re.match(r"reddit\.com/gallery/.*", sanitised_url): return Gallery elif re.match(r"patreon\.com.*", sanitised_url): diff --git a/bdfr/site_downloaders/flickr.py b/bdfr/site_downloaders/flickr.py new file mode 100644 index 00000000..dc08660e --- /dev/null +++ b/bdfr/site_downloaders/flickr.py @@ -0,0 +1,111 @@ +import json +import re +from typing import Optional + +from bs4 import BeautifulSoup +from cachetools import TTLCache, cached +from praw.models import Submission + +from bdfr.exceptions import 
SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + + +class Flickr(BaseDownloader): + def __init__(self, post: Submission) -> None: + super().__init__(post) + self.raw_data = {} + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + links = self._get_data(self.post.url) + if not links: + raise SiteDownloaderError("Flickr could not find any images to download") + return [Resource(self.post, link, Resource.retry_download(link)) for link in links] + + @staticmethod + @cached(cache=TTLCache(maxsize=5, ttl=10260)) + def _get_api_key() -> str: + key_regex = re.compile(r".*api_key=(\w*)(&.*)?") + req = Flickr.retrieve_url("https://www.flickr.com/services/api/response.json.html").text + elements = BeautifulSoup(req, "html.parser") + links = elements.find_all("a", href=True, string="here") + return key_regex.search(str(links[0])).group(1) + + @staticmethod + def _get_ids(link: str) -> tuple[str, str]: + flickr_regex = re.compile(r".*/photos/(?P<user>\d*@\D\d*|\w*)/(?:albums/(?P<album>\d*)|(?P<photo>\d*))") + try: + flickr_id = flickr_regex.search(link).group("photo") + if not flickr_id: + flickr_id = flickr_regex.search(link).group("album") + user = flickr_regex.search(link).group("user") + except AttributeError: + raise SiteDownloaderError(f"Could not extract Flickr ID from {link}") + return user, flickr_id + + @staticmethod + def _construct_direct_link(image_dict: json) -> str: + image_id = image_dict["photo"]["id"] + secret = image_dict["photo"]["secret"] + server = image_dict["photo"]["server"] + originalsecret = None + if "originalsecret" in image_dict["photo"]: + originalsecret = image_dict["photo"]["originalsecret"] + if "originalformat" in image_dict["photo"]: + originalformat = image_dict["photo"]["originalformat"] + if originalsecret: + return f"https://live.staticflickr.com/{server}/{image_id}_{originalsecret}_o.{originalformat}" + return f"https://live.staticflickr.com/{server}/{image_id}_{secret}_b.jpg" + + @staticmethod + def _get_album_links(album_dict: json, api_string: str) -> list: + out = [] + for photo in album_dict["photoset"]["photo"]: + res = Flickr.retrieve_url(f"{api_string}method=flickr.photos.getInfo&photo_id={photo['id']}") + image_dict = json.loads(res.text) + out.append(Flickr._construct_direct_link(image_dict)) + return out + + @staticmethod + def _get_user_id(user: str, api_string: str) -> str: + try: + req = Flickr.retrieve_url( + f"{api_string}method=flickr.urls.lookupUser&url=https://flickr.com/photos/{user}", + ).text + return json.loads(req)["user"]["id"] + except json.JSONDecodeError as e: + raise SiteDownloaderError(f"Could not parse flickr user ID from API: {e}") + + @staticmethod + def _expand_link(link: str) -> str: + return Flickr.retrieve_url(link).url + + @staticmethod + def _get_data(link: str) -> list: + if ("/gp/" in link) or ("flic.kr" in link): + link = Flickr._expand_link(link) + user, flickr_id = Flickr._get_ids(link) + api_key = Flickr._get_api_key() + api_string = f"https://www.flickr.com/services/rest/?api_key={api_key}&format=json&nojsoncallback=1&" + album = False + if "/albums/" in link: + if "@" not in user: + user = Flickr._get_user_id(user, api_string) + api = f"{api_string}method=flickr.photosets.getPhotos&photoset_id={flickr_id}&user_id={user}" + album = True + else: + api = f"{api_string}method=flickr.photos.getInfo&photo_id={flickr_id}" + + res = Flickr.retrieve_url(api) + + try: + image_dict =
json.loads(res.text) + except json.JSONDecodeError as e: + raise SiteDownloaderError(f"Could not parse received response as JSON: {e}") + + image_dict = ( + Flickr._get_album_links(image_dict, api_string) if album else [Flickr._construct_direct_link(image_dict)] + ) + + return image_dict diff --git a/tests/site_downloaders/test_flickr.py b/tests/site_downloaders/test_flickr.py new file mode 100644 index 00000000..436de3e7 --- /dev/null +++ b/tests/site_downloaders/test_flickr.py @@ -0,0 +1,91 @@ +from unittest.mock import Mock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.flickr import Flickr + + +@pytest.mark.online +def test_key_cache(): + key1 = Flickr._get_api_key() + key2 = Flickr._get_api_key() + assert key1 == key2 + + +@pytest.mark.parametrize( + ("test_url", "expected_user", "expected_id"), + ( + ("https://www.flickr.com/photos/137434519@N08/33635695603", "137434519@N08", "33635695603"), # Single photo + ( + "https://www.flickr.com/photos/63215229@N04/albums/72157644975251416", # Album + "63215229@N04", + "72157644975251416", + ), + ), +) +def test_get_ids(test_url: str, expected_user: str, expected_id: str): + user, f_id = Flickr._get_ids(test_url) + assert user == expected_user + assert f_id == expected_id + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_url", "expected_url"), + ( + ( + "https://www.flickr.com/gp/137434519@N08/83Q029", # /gp/ link + "https://www.flickr.com/photos/137434519@N08/33635695603/", + ), + ("https://flic.kr/p/2k5E4mv", "https://www.flickr.com/photos/129756120@N03/50592162657/"), # flic.kr link + ), +) +def test_expand_url(test_url: str, expected_url: str): + link = Flickr._expand_link(test_url) + assert link == expected_url + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_id", "expected_user"), + (("buta_suneo", "63215229@N04"),), # username to user ID +) +def test_get_user_id(test_id: str, expected_user: str): + api_key = Flickr._get_api_key() + api_string = f"https://www.flickr.com/services/rest/?api_key={api_key}&format=json&nojsoncallback=1&" + user = Flickr._get_user_id(test_id, api_string) + assert user == expected_user + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_url", "expected_hashes"), + ( + ("https://www.flickr.com/gp/137434519@N08/83Q029", {"b3f4e6fca1cc0ffca55368e4f94f9b5f"}), # Single photo + ("https://flic.kr/p/2k5E4mv", {"75ae4f5e70b9b7525041b1dcc852d144"}), # Single photo + ( + "http://www.flickr.com/photos/thekog/6886709962/", # Single photo + {"a4a64e606368f7b5a1995c84e15463e9"}, + ), + ( + "https://www.flickr.com/photos/ochre_jelly/albums/72157708743730852", # Album + { + "3c442ffdadff7b02cb7a133865339a26", + "8023fc0e76f891d585871ddd64edac23", + "9bbedad97b59ec51cb967da507351912", + "a86fcd3458620eec4cb3606882d11e9a", + "addb62d788c542383d1ad47914bbefb3", + }, + ), + ), +) +def test_download_resource(test_url: str, expected_hashes: set[str]): + mock_submission = Mock() + mock_submission.url = test_url + test_site = Flickr(mock_submission) + results = test_site.find_resources() + assert all(isinstance(res, Resource) for res in results) + [res.download() for res in results] + hashes = {res.hash.hexdigest() for res in results} + assert hashes == set(expected_hashes) From 663b9a9a2abda38cfd741ab055a5f840f951ba33 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Thu, 8 Jun 2023 23:40:38 -0400 Subject: [PATCH 54/82] remove failing test Resource deleted, Error 410. 
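The watch page behind this case now returns HTTP 410 Gone — the gif was deleted on Redgifs' side — so the expected hash can never match again. Removal can be confirmed with a plain HEAD request, e.g. `curl -I https://redgifs.com/watch/thirstyunfortunatewaterdragons` (illustrative; any HTTP client will do).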
--- tests/site_downloaders/test_redgifs.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index 86478f05..b8a6ed40 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -105,11 +105,6 @@ def test_download_resource(test_url: str, expected_hashes: set[str]): {"FlippantMemorableBaiji-mobile.mp4"}, {"41a5fb4865367ede9f65fc78736f497a"}, ), - ( - "https://redgifs.com/watch/thirstyunfortunatewaterdragons", - {"thirstyunfortunatewaterdragons-mobile.mp4"}, - {"1a51dad8fedb594bdd84f027b3cbe8af"}, - ), ( "https://redgifs.com/watch/conventionalplainxenopterygii", {"conventionalplainxenopterygii-mobile.mp4"}, From 912311ee3dcf87becac7be59d31e5fe7dcb784d2 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Mon, 12 Jun 2023 22:50:31 -0400 Subject: [PATCH 55/82] Flickr update Adds coverage for videos. --- bdfr/site_downloaders/flickr.py | 28 ++++++++++++++++++++++----- tests/site_downloaders/test_flickr.py | 1 + 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/bdfr/site_downloaders/flickr.py b/bdfr/site_downloaders/flickr.py index dc08660e..db2d4e71 100644 --- a/bdfr/site_downloaders/flickr.py +++ b/bdfr/site_downloaders/flickr.py @@ -6,7 +6,7 @@ from cachetools import TTLCache, cached from praw.models import Submission -from bdfr.exceptions import SiteDownloaderError +from bdfr.exceptions import ResourceNotFound, SiteDownloaderError from bdfr.resource import Resource from bdfr.site_authenticator import SiteAuthenticator from bdfr.site_downloaders.base_downloader import BaseDownloader @@ -27,8 +27,8 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l @cached(cache=TTLCache(maxsize=5, ttl=10260)) def _get_api_key() -> str: key_regex = re.compile(r".*api_key=(\w*)(&.*)?") - req = Flickr.retrieve_url("https://www.flickr.com/services/api/response.json.html").text - elements = BeautifulSoup(req, "html.parser") + res = Flickr.retrieve_url("https://www.flickr.com/services/api/response.json.html").text + elements = BeautifulSoup(res, "html.parser") links = elements.find_all("a", href=True, string="here") return key_regex.search(str(links[0])).group(1) @@ -49,11 +49,29 @@ def _construct_direct_link(image_dict: json) -> str: image_id = image_dict["photo"]["id"] secret = image_dict["photo"]["secret"] server = image_dict["photo"]["server"] + user = image_dict["photo"]["owner"]["nsid"] originalsecret = None if "originalsecret" in image_dict["photo"]: originalsecret = image_dict["photo"]["originalsecret"] if "originalformat" in image_dict["photo"]: originalformat = image_dict["photo"]["originalformat"] + if image_dict["photo"]["media"] == "video": + if originalsecret: + return Flickr.retrieve_url( + f"https://flickr.com/photos/{user}/{image_id}/play/orig/{originalsecret}/", + ).url + try: + return Flickr.retrieve_url(f"https://flickr.com/photos/{user}/{image_id}/play/1080p/{secret}/").url + except ResourceNotFound: + try: + return Flickr.retrieve_url(f"https://flickr.com/photos/{user}/{image_id}/play/720p/{secret}/").url + except ResourceNotFound: + try: + return Flickr.retrieve_url( + f"https://flickr.com/photos/{user}/{image_id}/play/360p/{secret}/", + ).url + except ResourceNotFound: + raise SiteDownloaderError("Could not find correct video from Flickr") if originalsecret: return f"https://live.staticflickr.com/{server}/{image_id}_{originalsecret}_o.{originalformat}" return 
f"https://live.staticflickr.com/{server}/{image_id}_{secret}_b.jpg" @@ -70,10 +88,10 @@ def _get_album_links(album_dict: json, api_string: str) -> list: @staticmethod def _get_user_id(user: str, api_string: str) -> str: try: - req = Flickr.retrieve_url( + res = Flickr.retrieve_url( f"{api_string}method=flickr.urls.lookupUser&url=https://flickr.com/photos/{user}", ).text - return json.loads(req)["user"]["id"] + return json.loads(res)["user"]["id"] except json.JSONDecodeError as e: raise SiteDownloaderError(f"Could not parse flickr user ID from API: {e}") diff --git a/tests/site_downloaders/test_flickr.py b/tests/site_downloaders/test_flickr.py index 436de3e7..7811efac 100644 --- a/tests/site_downloaders/test_flickr.py +++ b/tests/site_downloaders/test_flickr.py @@ -78,6 +78,7 @@ def test_get_user_id(test_id: str, expected_user: str): "addb62d788c542383d1ad47914bbefb3", }, ), + ("https://www.flickr.com/photos/eerokiuru/52902303276", {"adfd8175f398f87744285da2591c8215"}), # Single video ), ) def test_download_resource(test_url: str, expected_hashes: set[str]): From 68b2b9629535e921b27f7deee6a15b1d0a1aaea5 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Mon, 12 Jun 2023 18:05:53 -0400 Subject: [PATCH 56/82] Update versions --- .pre-commit-config.yaml | 7 ++++--- pyproject.toml | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1bf956c5..0f97a54b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,22 +3,23 @@ repos: - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.12.1 + rev: v0.13 hooks: - id: validate-pyproject name: validate-pyproject - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 23.3.0 hooks: - id: black name: black - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.254 + rev: v0.0.272 hooks: - id: ruff name: ruff + args: ["--fixable=I","--fix"] - repo: https://github.com/markdownlint/markdownlint rev: v0.12.0 diff --git a/pyproject.toml b/pyproject.toml index 183f9c04..f39997e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,10 +42,10 @@ data-files = {"config" = ["bdfr/default_config.cfg",]} [project.optional-dependencies] dev = [ - "black>=23.1.0", + "black>=23.3.0", "pre-commit>=3.0.4", "pytest>=7.2.1", - "ruff>=0.0.254", + "ruff>=0.0.272", "tox>=3.27.1", ] From 192ea38ea1cc2d6ce93fe166b89bfb023e64012c Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Wed, 8 Mar 2023 06:18:05 -0500 Subject: [PATCH 57/82] Oauth2 updates Change to allow use of client id without secret. 
--- bdfr/connector.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bdfr/connector.py b/bdfr/connector.py index 93652c88..1f2c334a 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -133,14 +133,18 @@ def parse_disabled_modules(self) -> None: def create_reddit_instance(self) -> None: if self.args.authenticate: logger.debug("Using authenticated Reddit instance") + client_id = self.cfg_parser.get("DEFAULT", "client_id") + client_secret = self.cfg_parser.get("DEFAULT", "client_secret", fallback=None) + if client_secret and client_secret.lower() == "none": + client_secret = None if not self.cfg_parser.has_option("DEFAULT", "user_token"): logger.log(9, "Commencing OAuth2 authentication") scopes = self.cfg_parser.get("DEFAULT", "scopes", fallback="identity, history, read, save") scopes = OAuth2Authenticator.split_scopes(scopes) oauth2_authenticator = OAuth2Authenticator( - scopes, - self.cfg_parser.get("DEFAULT", "client_id"), - self.cfg_parser.get("DEFAULT", "client_secret"), + wanted_scopes=scopes, + client_id=client_id, + client_secret=client_secret, user_agent=self.user_agent, ) token = oauth2_authenticator.retrieve_new_token() @@ -151,17 +155,20 @@ def create_reddit_instance(self) -> None: self.authenticated = True self.reddit_instance = praw.Reddit( - client_id=self.cfg_parser.get("DEFAULT", "client_id"), - client_secret=self.cfg_parser.get("DEFAULT", "client_secret"), + client_id=client_id, + client_secret=client_secret, user_agent=self.user_agent, token_manager=token_manager, ) else: logger.debug("Using unauthenticated Reddit instance") self.authenticated = False + client_secret = self.cfg_parser.get("DEFAULT", "client_secret", fallback=None) + if client_secret and client_secret.lower() == "none": + client_secret = None self.reddit_instance = praw.Reddit( client_id=self.cfg_parser.get("DEFAULT", "client_id"), - client_secret=self.cfg_parser.get("DEFAULT", "client_secret"), + client_secret=client_secret, user_agent=self.user_agent, ) From 7f4a84a3526bcf4fcfe974130437a04f0fce5ce1 Mon Sep 17 00:00:00 2001 From: Soulsuck24 <79275800+Soulsuck24@users.noreply.github.com> Date: Sat, 17 Jun 2023 16:08:45 -0400 Subject: [PATCH 58/82] Add site nsfw.pics --- bdfr/site_downloaders/download_factory.py | 3 + bdfr/site_downloaders/nsfw_pics.py | 46 ++++++++++++ tests/site_downloaders/test_nsfw_pics.py | 85 +++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 bdfr/site_downloaders/nsfw_pics.py create mode 100644 tests/site_downloaders/test_nsfw_pics.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index e3d36506..194a3ec2 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -15,6 +15,7 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgchest import Imgchest from bdfr.site_downloaders.imgur import Imgur +from bdfr.site_downloaders.nsfw_pics import NsfwPics from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost @@ -51,6 +52,8 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Gallery elif re.match(r"imgchest\.com/p/", sanitised_url): return Imgchest + elif re.match(r"nsfw\.pics", sanitised_url): + return NsfwPics elif re.match(r"reddit\.com/r/", sanitised_url): return SelfPost elif re.match(r"(m\.)?youtu\.?be", sanitised_url): diff --git a/bdfr/site_downloaders/nsfw_pics.py 
b/bdfr/site_downloaders/nsfw_pics.py new file mode 100644 index 00000000..d6221c5d --- /dev/null +++ b/bdfr/site_downloaders/nsfw_pics.py @@ -0,0 +1,46 @@ +import logging +from typing import Optional + +import bs4 +from praw.models import Submission + +from bdfr.exceptions import SiteDownloaderError +from bdfr.resource import Resource +from bdfr.site_authenticator import SiteAuthenticator +from bdfr.site_downloaders.base_downloader import BaseDownloader + +logger = logging.getLogger(__name__) + + +class NsfwPics(BaseDownloader): + def __init__(self, post: Submission) -> None: + super().__init__(post) + + def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: + links = self._get_links(self.post.url) + if not links: + raise SiteDownloaderError("nsfw.pics parser could not find any links") + links = [Resource(self.post, link, Resource.retry_download(link)) for link in links] + return links + + @staticmethod + def _get_album_links(url: str) -> list: + image_pages = [] + album = NsfwPics.retrieve_url(f"{url}") + soup = bs4.BeautifulSoup(album.text, "html.parser") + album_divs = soup.find("div", attrs={"class": "pad-content-listing"}) + links = album_divs.find_all("div", {"data-type": "image"}) + for link in links: + image_pages.append(link.get("data-url-short")) + return image_pages + + @staticmethod + def _get_links(url: str) -> set[str]: + resources = [] + urls = NsfwPics._get_album_links(url) if "/album/" in url else [url] + for url in urls: + page = NsfwPics.retrieve_url(url) + soup = bs4.BeautifulSoup(page.text, "html.parser") + image_link = soup.find("input", attrs={"id": "embed-code-2"}).get("value") + resources.append(image_link) + return set(resources) diff --git a/tests/site_downloaders/test_nsfw_pics.py b/tests/site_downloaders/test_nsfw_pics.py new file mode 100644 index 00000000..7516807b --- /dev/null +++ b/tests/site_downloaders/test_nsfw_pics.py @@ -0,0 +1,85 @@ +from unittest.mock import Mock + +import pytest + +from bdfr.resource import Resource +from bdfr.site_downloaders.nsfw_pics import NsfwPics + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_url", "expected"), + ( + ( + "https://nsfw.pics/album/Test.l2t", # Album + { + "https://nsfw.pics/image/OdfV", + "https://nsfw.pics/image/ObUF", + "https://nsfw.pics/image/OOV7", + "https://nsfw.pics/image/OD71", + "https://nsfw.pics/image/O6du", + }, + ), + ), +) +def test_get_album(test_url: str, expected: set[str]): + results = NsfwPics._get_album_links(test_url) + assert len(results) == len(expected) + assert sorted(results) == sorted(expected) + + +@pytest.mark.online +@pytest.mark.parametrize( + ("test_url", "expected"), + ( + ( + "https://nsfw.pics/album/Test.l2t", # Album + { + "https://i.nsfw.pics/b8007b506022132fe857eead3dc98a92.gif", + "https://i.nsfw.pics/aa0541830d5d16743bca9bfb48e16b7b.gif", + "https://i.nsfw.pics/b4afb5a33e68d3d74a547f62684cddc9.jpeg", + "https://i.nsfw.pics/131ed0764342b570a338af37cdd75e3e.jpeg", + "https://i.nsfw.pics/c447389dee315f5960eb29671fb56232.jpeg", + }, + ), + ( + "https://nsfw.pics/image/OdfV", # Single image + {"https://i.nsfw.pics/b8007b506022132fe857eead3dc98a92.gif"}, + ), + ), +) +def test_get_links(test_url: str, expected: set[str]): + results = NsfwPics._get_links(test_url) + assert sorted(results) == sorted(expected) + + +@pytest.mark.online +@pytest.mark.slow +@pytest.mark.parametrize( + ("test_url", "expected_hashes"), + ( + ( + "https://nsfw.pics/album/Test.l2t", # Album + { + "9ceac1e26c4799b0a6b7d5453a73f53b", + 
"8ff9229c39ad5403e9859a21d5aec103", + "907f92b1c295d5f84f4f64aacc960079", + "1098edadc345ec948d37e1541ed867eb", + "fb60e0a42a0f7f0929f5a5ae401a3518", + }, + ), + ( + "https://nsfw.pics/image/OdfV", # Single image + {"9ceac1e26c4799b0a6b7d5453a73f53b"}, + ), + ), +) +def test_download_resources(test_url: str, expected_hashes: set[str]): + mock_download = Mock() + mock_download.url = test_url + downloader = NsfwPics(mock_download) + results = downloader.find_resources() + assert all(isinstance(res, Resource) for res in results) + [res.download() for res in results] + hashes = {res.hash.hexdigest() for res in results} + assert hashes == set(expected_hashes) From bcce19100bb4ff34adb3f12a3aaf3683bd8e2709 Mon Sep 17 00:00:00 2001 From: Armin Samii Date: Sat, 17 Jun 2023 20:53:02 -0400 Subject: [PATCH 59/82] add test --- tests/archive_entry/test_submission_archive_entry.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/archive_entry/test_submission_archive_entry.py b/tests/archive_entry/test_submission_archive_entry.py index 6cea3f8d..c2ed1240 100644 --- a/tests/archive_entry/test_submission_archive_entry.py +++ b/tests/archive_entry/test_submission_archive_entry.py @@ -16,6 +16,16 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc assert len(results) >= min_comments +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.parametrize(("test_submission_id", "min_comments"), (("m3reby", 27),)) +def test_skip_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit): + test_submission = reddit_instance.submission(id=test_submission_id) + test_archive_entry = SubmissionArchiveEntry(test_submission, false) + results = test_archive_entry._get_comments() + assert len(results) == 0 + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.parametrize( From 928aa7e623b12e82dfa8022a3ea947c93adbf978 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 18 Jun 2023 13:11:51 +1000 Subject: [PATCH 60/82] Fix declaration --- bdfr/archive_entry/submission_archive_entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/archive_entry/submission_archive_entry.py b/bdfr/archive_entry/submission_archive_entry.py index b79b13e2..852cb760 100644 --- a/bdfr/archive_entry/submission_archive_entry.py +++ b/bdfr/archive_entry/submission_archive_entry.py @@ -10,7 +10,7 @@ class SubmissionArchiveEntry(BaseArchiveEntry): - def __init__(self, submission: praw.models.Submission, include_comments: bool) -> None: + def __init__(self, submission: praw.models.Submission, include_comments: bool = True) -> None: super().__init__(submission) self._include_comments = include_comments From fded771d6262b1312599e8dbc1149318b9580d9c Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 18 Jun 2023 13:15:02 +1000 Subject: [PATCH 61/82] Fix test --- tests/archive_entry/test_submission_archive_entry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/archive_entry/test_submission_archive_entry.py b/tests/archive_entry/test_submission_archive_entry.py index c2ed1240..c3e24187 100644 --- a/tests/archive_entry/test_submission_archive_entry.py +++ b/tests/archive_entry/test_submission_archive_entry.py @@ -21,9 +21,9 @@ def test_get_comments(test_submission_id: str, min_comments: int, reddit_instanc @pytest.mark.parametrize(("test_submission_id", "min_comments"), (("m3reby", 27),)) def test_skip_comments(test_submission_id: str, min_comments: int, reddit_instance: praw.Reddit): test_submission = 
reddit_instance.submission(id=test_submission_id) - test_archive_entry = SubmissionArchiveEntry(test_submission, false) - results = test_archive_entry._get_comments() - assert len(results) == 0 + test_archive_entry = SubmissionArchiveEntry(test_submission, False) + results = test_archive_entry.compile() + assert len(results["comments"]) == 0 @pytest.mark.online From 4eeb4238620a5b1990594e4e4782fa9e5f672696 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 18 Jun 2023 13:18:05 +1000 Subject: [PATCH 62/82] Reformat file --- bdfr/archiver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 808a9524..69dea170 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -73,7 +73,10 @@ def get_user_data(self) -> list[Iterator]: results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit)) return results - def _pull_lever_entry_factory(self, praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry: + def _pull_lever_entry_factory( + self, + praw_item: Union[praw.models.Submission, praw.models.Comment], + ) -> BaseArchiveEntry: if isinstance(praw_item, praw.models.Submission): return SubmissionArchiveEntry(praw_item, not self.args.skip_comments) elif isinstance(praw_item, praw.models.Comment): From f28f74cfc57d0e19bb0af84982fe333c5ae62a0b Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 18 Jun 2023 13:18:16 +1000 Subject: [PATCH 63/82] Add integration test --- tests/integration_tests/test_archive_integration.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index aa98b2c4..d7bb2000 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -199,3 +199,16 @@ def test_user_serv_fail(test_args: list[str], response: int, tmp_path: Path): result = runner.invoke(cli, test_args) assert result.exit_code == 0 assert f"received {response} HTTP response" in result.output + + +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize("test_args", (["--skip-comments", "--link", "gxqapql"],)) +def test_cli_archive_skip_comments(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert "Converting comment" not in result.output + assert "Retrieving full comment tree for submission" not in result.output From fb6ae392b1539648a00ee61e7c15609034cecb9a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Thu, 15 Jun 2023 12:52:06 +1000 Subject: [PATCH 64/82] Add instruction for logs to bug reports --- .github/ISSUE_TEMPLATE/bug_report.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b4cae852..bc1c231b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -29,6 +29,8 @@ Paste here the command(s) that causes the bug ## Logs +These are my **unaltered** logs: + ```text Paste the log output here. 
``` From 6d0386a2ac253df61d15906b5500b5952a9ed4ea Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Wed, 21 Jun 2023 13:04:50 +1000 Subject: [PATCH 65/82] Fix command name to match documentation Closes #844 --- bdfr/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index 54a943cc..a0379e36 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -171,7 +171,7 @@ def cli_clone(context: click.Context, **_) -> None: logger.info("Program complete - BDFR Cloner v{__version__}") -@cli.command("completion") +@cli.command("completions") @click.argument("shell", type=click.Choice(("all", "bash", "fish", "zsh"), case_sensitive=False), default="all") @click.help_option("-h", "--help") @click.option("-u", "--uninstall", is_flag=True, default=False, help="Uninstall completion") From c993e0163496bfb55c1ad4b8952524d819a1f45a Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Sat, 24 Jun 2023 16:42:36 -0400 Subject: [PATCH 66/82] Test fixes --- .../test_download_integration.py | 20 +++++++++---------- tests/site_downloaders/test_gallery.py | 8 +++----- tests/test_configuration.py | 2 +- tests/test_connector.py | 4 ++-- tests/test_downloader.py | 10 +++++----- tests/test_file_name_formatter.py | 14 ++++++------- tests/yaml_test_configuration.yaml | 2 +- 7 files changed, 29 insertions(+), 31 deletions(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index 188d5528..d14920f5 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -39,16 +39,16 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): @pytest.mark.parametrize( "test_args", ( - ["-s", "Mindustry", "-L", 3], - ["-s", "r/Mindustry", "-L", 3], - ["-s", "r/mindustry", "-L", 3], - ["-s", "mindustry", "-L", 3], + ["-s", "EmpireDidNothingWrong", "-L", 3], + ["-s", "r/EmpireDidNothingWrong", "-L", 3], + ["-s", "r/EmpireDidNothingWrong", "-L", 3], + ["-s", "EmpireDidNothingWrong", "-L", 3], ["-s", "https://www.reddit.com/r/TrollXChromosomes/", "-L", 3], ["-s", "r/TrollXChromosomes/", "-L", 3], ["-s", "TrollXChromosomes/", "-L", 3], ["-s", "trollxchromosomes", "-L", 3], - ["-s", "trollxchromosomes,mindustry,python", "-L", 3], - ["-s", "trollxchromosomes, mindustry, python", "-L", 3], + ["-s", "trollxchromosomes,EmpireDidNothingWrong,python", "-L", 3], + ["-s", "trollxchromosomes, EmpireDidNothingWrong, python", "-L", 3], ["-s", "trollxchromosomes", "-L", 3, "--time", "day"], ["-s", "trollxchromosomes", "-L", 3, "--sort", "new"], ["-s", "trollxchromosomes", "-L", 3, "--time", "day", "--sort", "new"], @@ -388,10 +388,10 @@ def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path): @pytest.mark.parametrize( ("test_args", "was_filtered"), ( - (["-l", "ljyy27", "--min-score", "50"], True), - (["-l", "ljyy27", "--min-score", "1"], False), - (["-l", "ljyy27", "--max-score", "1"], True), - (["-l", "ljyy27", "--max-score", "100"], False), + (["-l", "w22m5l", "--min-score", "50000"], True), + (["-l", "w22m5l", "--min-score", "1"], False), + (["-l", "w22m5l", "--max-score", "1"], True), + (["-l", "w22m5l", "--max-score", "50000"], False), ), ) def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp_path: Path): diff --git a/tests/site_downloaders/test_gallery.py b/tests/site_downloaders/test_gallery.py index cf0d3711..c0a4b9ee 100644 --- 
a/tests/site_downloaders/test_gallery.py +++ b/tests/site_downloaders/test_gallery.py @@ -61,12 +61,10 @@ def test_gallery_get_links(test_ids: list[dict], expected: set[str]): }, ), ( - "ljyy27", + "w22m5l", { - "359c203ec81d0bc00e675f1023673238", - "79262fd46bce5bfa550d878a3b898be4", - "808c35267f44acb523ce03bfa5687404", - "ec8b65bdb7f1279c4b3af0ea2bbb30c3", + "26aa07eed6dd0bd0ec871a9dcdd572ef", + "7e8d2dc005b1270947a0cef4cd64238f", }, ), ( diff --git a/tests/test_configuration.py b/tests/test_configuration.py index b071a83a..7694e3db 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -30,6 +30,6 @@ def test_yaml_file_read(): file = "./tests/yaml_test_configuration.yaml" test_config = Configuration() test_config.parse_yaml_options(file) - assert test_config.subreddit == ["EarthPorn", "TwoXChromosomes", "Mindustry"] + assert test_config.subreddit == ["EarthPorn", "TwoXChromosomes", "EmpireDidNothingWrong"] assert test_config.sort == "new" assert test_config.limit == 10 diff --git a/tests/test_connector.py b/tests/test_connector.py index 55ac6bfa..03fe01ff 100644 --- a/tests/test_connector.py +++ b/tests/test_connector.py @@ -188,7 +188,7 @@ def test_get_submissions_from_link( ("test_subreddits", "limit", "sort_type", "time_filter", "max_expected_len"), ( (("Futurology",), 10, "hot", "all", 10), - (("Futurology", "Mindustry, Python"), 10, "hot", "all", 30), + (("Futurology", "EmpireDidNothingWrong, Python"), 10, "hot", "all", 30), (("Futurology",), 20, "hot", "all", 20), (("Futurology", "Python"), 10, "hot", "all", 20), (("Futurology",), 100, "hot", "all", 100), @@ -517,7 +517,7 @@ def test_check_subreddit_status_bad(test_subreddit_name: str, expected_message: "test_subreddit_name", ( "Python", - "Mindustry", + "EmpireDidNothingWrong", "TrollXChromosomes", "all", ), diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 2b17eb02..1755d5ba 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -174,7 +174,7 @@ def test_download_submission_file_exists( @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(("test_submission_id", "expected_files_len"), (("ljyy27", 4),)) +@pytest.mark.parametrize(("test_submission_id", "expected_files_len"), (("w22m5l", 2),)) def test_download_submission( test_submission_id: str, expected_files_len: int, @@ -195,7 +195,7 @@ def test_download_submission( @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(("test_submission_id", "min_score"), (("ljyy27", 1),)) +@pytest.mark.parametrize(("test_submission_id", "min_score"), (("w22m5l", 1),)) def test_download_submission_min_score_above( test_submission_id: str, min_score: int, @@ -219,7 +219,7 @@ def test_download_submission_min_score_above( @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(("test_submission_id", "min_score"), (("ljyy27", 25),)) +@pytest.mark.parametrize(("test_submission_id", "min_score"), (("w22m5l", 50000),)) def test_download_submission_min_score_below( test_submission_id: str, min_score: int, @@ -243,7 +243,7 @@ def test_download_submission_min_score_below( @pytest.mark.online @pytest.mark.reddit -@pytest.mark.parametrize(("test_submission_id", "max_score"), (("ljyy27", 25),)) +@pytest.mark.parametrize(("test_submission_id", "max_score"), (("w22m5l", 50000),)) def test_download_submission_max_score_below( test_submission_id: str, max_score: int, @@ -267,7 +267,7 @@ def test_download_submission_max_score_below( @pytest.mark.online @pytest.mark.reddit 
-@pytest.mark.parametrize(("test_submission_id", "max_score"), (("ljyy27", 1),)) +@pytest.mark.parametrize(("test_submission_id", "max_score"), (("w22m5l", 1),)) def test_download_submission_max_score_above( test_submission_id: str, max_score: int, diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index 7afc2b8a..daf6526d 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -336,10 +336,10 @@ def test_strip_emojies(test_string: str, expected: str): ("test_submission_id", "expected"), ( ( - "mfuteh", + "718ifq", { - "title": "Why Do Interviewers Ask Linked List Questions?", - "redditor": "mjgardner", + "title": "Wood Stormtrooper Carving", + "redditor": "deathakissaway", }, ), ), @@ -357,11 +357,11 @@ def test_generate_dict_for_submission(test_submission_id: str, expected: dict, r ("test_comment_id", "expected"), ( ( - "gsq0yuw", + "dn8xwh1", { - "title": "Why Do Interviewers Ask Linked List Questions?", - "redditor": "Doctor-Dapper", - "postid": "gsq0yuw", + "title": "Wood Stormtrooper Carving", + "redditor": "lemonman37", + "postid": "dn8xwh1", "flair": "", }, ), diff --git a/tests/yaml_test_configuration.yaml b/tests/yaml_test_configuration.yaml index 5621721d..e9819883 100644 --- a/tests/yaml_test_configuration.yaml +++ b/tests/yaml_test_configuration.yaml @@ -3,4 +3,4 @@ sort: new subreddit: - EarthPorn - TwoXChromosomes - - Mindustry \ No newline at end of file + - EmpireDidNothingWrong \ No newline at end of file From ca2408b6e11b6b66a511ec400514cf0ed3f92d04 Mon Sep 17 00:00:00 2001 From: OMEGARAZER <869111+OMEGARAZER@users.noreply.github.com> Date: Tue, 20 Jun 2023 17:48:28 -0400 Subject: [PATCH 67/82] RUF013 --- bdfr/download_filter.py | 7 ++++++- bdfr/resource.py | 6 +++++- bdfr/site_downloaders/base_downloader.py | 15 ++++++++++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/bdfr/download_filter.py b/bdfr/download_filter.py index 0e6f1c6b..a1953692 100644 --- a/bdfr/download_filter.py +++ b/bdfr/download_filter.py @@ -2,6 +2,7 @@ import logging import re +from typing import Optional from bdfr.resource import Resource @@ -9,7 +10,11 @@ class DownloadFilter: - def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None) -> None: + def __init__( + self, + excluded_extensions: Optional[list[str]] = None, + excluded_domains: Optional[list[str]] = None, + ) -> None: self.excluded_extensions = excluded_extensions self.excluded_domains = excluded_domains diff --git a/bdfr/resource.py b/bdfr/resource.py index 375e6ce2..d9f07146 100644 --- a/bdfr/resource.py +++ b/bdfr/resource.py @@ -19,7 +19,11 @@ class Resource: def __init__( - self, source_submission: Submission, url: str, download_function: Callable, extension: str = None + self, + source_submission: Submission, + url: str, + download_function: Callable, + extension: Optional[str] = None, ) -> None: self.source_submission = source_submission self.content: Optional[bytes] = None diff --git a/bdfr/site_downloaders/base_downloader.py b/bdfr/site_downloaders/base_downloader.py index 98d67073..8a8d7a1a 100644 --- a/bdfr/site_downloaders/base_downloader.py +++ b/bdfr/site_downloaders/base_downloader.py @@ -25,7 +25,11 @@ def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> l raise NotImplementedError @staticmethod - def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: + def retrieve_url( + url: str, + cookies: Optional[dict] = None, + headers: 
Optional[dict] = None, + ) -> requests.Response: try: res = requests.get(url, cookies=cookies, headers=headers, timeout=10) except requests.exceptions.RequestException as e: @@ -39,7 +43,12 @@ def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> reques return res @staticmethod - def post_url(url: str, cookies: dict = None, headers: dict = None, payload: dict = None) -> requests.Response: + def post_url( + url: str, + cookies: Optional[dict] = None, + headers: Optional[dict] = None, + payload: Optional[dict] = None, + ) -> requests.Response: try: res = requests.post(url, cookies=cookies, headers=headers, json=payload, timeout=10) except requests.exceptions.RequestException as e: @@ -53,7 +62,7 @@ def post_url(url: str, cookies: dict = None, headers: dict = None, payload: dict return res @staticmethod - def head_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response: + def head_url(url: str, cookies: Optional[dict] = None, headers: Optional[dict] = None) -> requests.Response: try: res = requests.head(url, cookies=cookies, headers=headers, timeout=10) except requests.exceptions.RequestException as e: From b97dacb2f3b932f2fe806647c2363530d32d244a Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 25 Jun 2023 13:25:01 +1000 Subject: [PATCH 68/82] Allow archiver format to be specified multiple times --- README.md | 1 + bdfr/__main__.py | 2 +- bdfr/archiver.py | 17 +++++++++-------- bdfr/configuration.py | 4 +++- .../test_archive_integration.py | 16 ++++++++++++++++ 5 files changed, 30 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5df4f7f2..aeb28b22 100644 --- a/README.md +++ b/README.md @@ -289,6 +289,7 @@ The following options are for the `archive` command specifically. - `json` (default) - `xml` - `yaml` + - Can be specified multiple times - `--comment-context` - This option will, instead of downloading an individual comment, download the submission that comment is a part of - May result in a longer run time as it retrieves much more data diff --git a/bdfr/__main__.py b/bdfr/__main__.py index a0379e36..c263769c 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -65,7 +65,7 @@ _archiver_options = [ click.option("--all-comments", is_flag=True, default=None), click.option("--comment-context", is_flag=True, default=None), - click.option("-f", "--format", type=click.Choice(("xml", "json", "yaml")), default=None), + click.option("-f", "--format", type=click.Choice(("xml", "json", "yaml")), default=None, multiple=True), click.option("--skip-comments", is_flag=True, default=None), ] diff --git a/bdfr/archiver.py b/bdfr/archiver.py index 69dea170..a6b014ea 100644 --- a/bdfr/archiver.py +++ b/bdfr/archiver.py @@ -89,14 +89,15 @@ def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comme logger.debug(f"Converting comment {praw_item.id} to submission {praw_item.submission.id}") praw_item = praw_item.submission archive_entry = self._pull_lever_entry_factory(praw_item) - if self.args.format == "json": - self._write_entry_json(archive_entry) - elif self.args.format == "xml": - self._write_entry_xml(archive_entry) - elif self.args.format == "yaml": - self._write_entry_yaml(archive_entry) - else: - raise ArchiverError(f"Unknown format {self.args.format!r} given") + for format_specification in self.args.format: + if format_specification == "json": + self._write_entry_json(archive_entry) + elif format_specification == "xml": + self._write_entry_xml(archive_entry) + elif format_specification == "yaml": + 
self._write_entry_yaml(archive_entry) + else: + raise ArchiverError(f"Unknown format {self.args.format!r} given") logger.info(f"Record for entry item {praw_item.id} written to disk") def _write_entry_json(self, entry: BaseArchiveEntry) -> None: diff --git a/bdfr/configuration.py b/bdfr/configuration.py index e9f3a2b1..4d2ca188 100644 --- a/bdfr/configuration.py +++ b/bdfr/configuration.py @@ -56,7 +56,9 @@ def __init__(self) -> None: # Archiver-specific options self.all_comments = False - self.format = "json" + self.format = [ + "json", + ] self.comment_context: bool = False self.skip_comments = False diff --git a/tests/integration_tests/test_archive_integration.py b/tests/integration_tests/test_archive_integration.py index d7bb2000..423d1f98 100644 --- a/tests/integration_tests/test_archive_integration.py +++ b/tests/integration_tests/test_archive_integration.py @@ -52,6 +52,22 @@ def test_cli_archive_single(test_args: list[str], tmp_path: Path): assert re.search(r"Writing entry .*? to file in .*? format", result.output) +@pytest.mark.online +@pytest.mark.reddit +@pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") +@pytest.mark.parametrize( + "test_args", + (["-l", "m2601g", "-f", "yaml", "-f", "json"],), +) +def test_cli_archive_single_multi_format(test_args: list[str], tmp_path: Path): + runner = CliRunner() + test_args = create_basic_args_for_archive_runner(test_args, tmp_path) + result = runner.invoke(cli, test_args) + assert result.exit_code == 0 + assert re.search(r"Writing entry .*? to file in YAML format", result.output) + assert re.search(r"Writing entry .*? to file in JSON format", result.output) + + @pytest.mark.online @pytest.mark.reddit @pytest.mark.skipif(not does_test_config_exist, reason="A test config file is required for integration tests") From 3584ff19535a8b6b7d7d58e47fd7cd3b5962e41f Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 25 Jun 2023 14:22:42 +1000 Subject: [PATCH 69/82] Format README according to line length --- .markdown_style.rb | 2 +- README.md | 221 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 168 insertions(+), 55 deletions(-) diff --git a/.markdown_style.rb b/.markdown_style.rb index 32ee0b1f..f59fa56f 100644 --- a/.markdown_style.rb +++ b/.markdown_style.rb @@ -1,4 +1,4 @@ all -exclude_tag :line_length +rule 'MD013', :line_length => 120, :code_blocks => false rule 'MD007', :indent => 4 rule 'MD029', :style => 'ordered' diff --git a/README.md b/README.md index aeb28b22..3a671e5b 100644 --- a/README.md +++ b/README.md @@ -8,15 +8,20 @@ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=Python)](https://github.com/psf/black) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) -This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources) +This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl +Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive +command-line interface. 
[List of currently supported sources](#list-of-currently-supported-sources) -If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate. +If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure +that your issue is clear and contains everything it needs to for the developers to investigate. -Included in this README are a few example Bash tricks to get certain behaviour. For that, see [Common Command Tricks](#common-command-tricks). +Included in this README are a few example Bash tricks to get certain behaviour. For that, see [Common Command +Tricks](#common-command-tricks). ## Installation -*Bulk Downloader for Reddit* needs Python version 3.9 or above. Please update Python before installation to meet the requirement. +*Bulk Downloader for Reddit* needs Python version 3.9 or above. Please update Python before installation to meet the +requirement. Then, you can install it via pip with: @@ -45,15 +50,24 @@ If on Arch Linux or derivative operating systems such as Manjaro, the BDFR can b ### Source code -If you want to use the source code or make contributions, refer to [CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development) +If you want to use the source code or make contributions, refer to +[CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development) ## Usage -The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user. +The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These +sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded +to disk, according to a naming and organisational scheme defined by the user. -There are three modes to the BDFR: download, archive, and clone. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as and all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML. Lastly, the `clone` command will perform both functions of the previous commands at once and is more efficient than running those commands sequentially. +There are three modes to the BDFR: download, archive, and clone. Each one has a command that performs similar but +distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the +images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission +details, upvotes, text, statistics, as well as all the comments on that submission. These can then be saved in a data markup +language form, such as JSON, XML, or YAML. Lastly, the `clone` command will perform both functions of the previous +commands at once and is more efficient than running those commands sequentially. 
-Note that the `clone` command is not a true, failthful clone of Reddit. It simply retrieves much of the raw data that Reddit provides. To get a true clone of Reddit, another tool such as HTTrack should be used. +Note that the `clone` command is not a true, faithful clone of Reddit. It simply retrieves much of the raw data that +Reddit provides. To get a true clone of Reddit, another tool such as HTTrack should be used. After installation, run the program from any directory as shown below: @@ -69,7 +83,8 @@ bdfr archive bdfr clone ``` -However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are: +However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. +Don't forget that some parameters can be provided multiple times. Some quick reference commands are: ```bash bdfr download ./path/to/output --subreddit Python -L 10 @@ -121,7 +136,8 @@ bdfr download ./path/to/output --skip mp4 --skip avi --file-scheme "{UPVOTES}_{R Any option that can be specified multiple times should be formatted like subreddit is above. -In case when the same option is specified both in the YAML file and in as a command line argument, the command line argument takes priority +In case when the same option is specified both in the YAML file and as a command line argument, the command line +argument takes priority ## Options @@ -175,8 +191,10 @@ The following options are common between both the `archive` and `download` comma - `-L, --limit` - This is the limit on the number of submissions retrieve - Default is max possible - - Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped - - If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.** + - Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are + provided, then 30 total submissions will be scraped + - If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We + cannot bypass this.** - `-S, --sort` - This is the sort type for each applicable submission source supplied to the BDFR - This option does not apply to upvoted, downvoted or saved posts when scraping from these sources @@ -214,7 +232,8 @@ The following options are common between both the `archive` and `download` comma - `year` - `--time-format` - This specifies the format of the datetime string that replaces `{DATE}` in file and folder naming schemes - - See [Time Formatting Customisation](#time-formatting-customisation) for more details, and the formatting scheme + - See [Time Formatting Customisation](#time-formatting-customisation) for more details, and the formatting + scheme - `-u, --user` - This specifies the user to scrape in concert with other options - When using `--authenticate`, `--user me` can be used to refer to the authenticated user @@ -226,7 +245,8 @@ The following options are common between both the `archive` and `download` comma ### Downloader Options -The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory. 
+The following options apply only to the `download` command. This command downloads the files and resources linked to in +the submission, or a text submission itself, to the disk in the specified directory. - `--make-hard-links` - This flag will create hard links to an existing file when a duplicate is downloaded in the current run @@ -240,7 +260,8 @@ The following options apply only to the `download` command. This command downloa - This is calculated by MD5 hash - `--search-existing` - This will make the BDFR compile the hashes for every file in `directory` - - The hashes are used to skip duplicate files if `--no-dupes` is supplied or make hard links if `--make-hard-links` is supplied + - The hashes are used to skip duplicate files if `--no-dupes` is supplied or make hard links if `--make-hard-links` + is supplied - **The use of this option is highly discouraged due to inefficiency** - `--file-scheme` - Sets the scheme for files @@ -262,7 +283,8 @@ The following options apply only to the `download` command. This command downloa - Can be specified multiple times - Domains must be supplied in the form `example.com` or `img.example.com` - `--skip` - - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded + - This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be + downloaded - Can be specified multiple times - `--skip-subreddit` - This skips all submissions from the specified subreddit @@ -299,11 +321,13 @@ The following options are for the `archive` command specifically. ### Cloner Options -The `clone` command can take all the options listed above for both the `archive` and `download` commands since it performs the functions of both. +The `clone` command can take all the options listed above for both the `archive` and `download` commands since it +performs the functions of both. ## Common Command Tricks -A common use case is for subreddits/users to be loaded from a file. The BDFR supports this via YAML file options (`--opts my_opts.yaml`). +A common use case is for subreddits/users to be loaded from a file. The BDFR supports this via YAML file options +(`--opts my_opts.yaml`). Alternatively, you can use the command-line [xargs](https://en.wikipedia.org/wiki/Xargs) function. For a list of users `users.txt` (one user per line), type: @@ -312,27 +336,46 @@ For a list of users `users.txt` (one user per line), type: cat users.txt | xargs -L 1 echo --user | xargs -L 50 bdfr download ``` -The part `-L 50` is to make sure that the character limit for a single line isn't exceeded, but may not be necessary. This can also be used to load subreddits from a file, simply exchange `--user` with `--subreddit` and so on. +The part `-L 50` is to make sure that the character limit for a single line isn't exceeded, but may not be necessary. +This can also be used to load subreddits from a file, simply exchange `--user` with `--subreddit` and so on. ## Authentication and Security -The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, downvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, downvoted posts, saved posts, and private multireddits. 
+The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, +token-based system for making requests. This also means that the BDFR only has access to specific parts of the account +authenticated, by default only saved posts, upvoted posts, downvoted posts, and the identity of the authenticated +account. Note that authentication is not required unless accessing private things like upvoted posts, downvoted posts, +saved posts, and private multireddits. -To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Read this and **confirm that there are no more permissions than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests permission to read your saved, upvoted, or downvoted submissions and identify as you. +To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been +a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This +is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to +Reddit, where the permissions being requested will be shown. Read this and **confirm that there are no more permissions +than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests +permission to read your saved, upvoted, or downvoted submissions and identify as you. -If the permissions look safe, confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on. +If the permissions look safe, confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit +from then on. ## Changing Permissions -Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it *is* allowed by the BDFR. +Most users will not need to do anything extra to use any of the current features. However, if additional features such +as scraping messages, PMs, etc are added in the future, these will require additional scopes. Additionally, advanced +users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it *is* +allowed by the BDFR. -The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need. +The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request +when registering itself to a Reddit account via OAuth2. 
These can all be changed if the user wishes, however do not do +so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your +token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need. For more details on the configuration file and the values therein, see [Configuration Files](#configuration). ## Folder and File Name Schemes -The naming and folder schemes for the BDFR are both completely customisable. A number of different fields can be given which will be replaced with properties from a submission when downloading it. The scheme format takes the form of `{KEY}`, where `KEY` is a string from the below list. +The naming and folder schemes for the BDFR are both completely customisable. A number of different fields can be given +which will be replaced with properties from a submission when downloading it. The scheme format takes the form of +`{KEY}`, where `KEY` is a string from the below list. - `DATE` - `FLAIR` @@ -342,22 +385,32 @@ The naming and folder schemes for the BDFR are both completely customisable. A n - `TITLE` - `UPVOTES` -Each of these can be enclosed in curly bracket, `{}`, and included in the name. For example, to just title every downloaded post with the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as `download_{POSTID}` which will not change from submission to submission. For example, the previous string will result in the following submission file names: +Each of these can be enclosed in curly bracket, `{}`, and included in the name. For example, to just title every +downloaded post with the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as +`download_{POSTID}` which will not change from submission to submission. For example, the previous string will result in +the following submission file names: - `download_aaaaaa.png` - `download_bbbbbb.png` -At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name. In both cases, there will be no separation between all submissions. +At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme however, +can be null or a simple static string. In the former case, all files will be placed in the folder specified with the +`directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that +name. In both cases, there will be no separation between all submissions. -It is highly recommended that the file name scheme contain the parameter `{POSTID}` as this is **the only parameter guaranteed to be unique**. No combination of other keys will necessarily be unique and may result in posts being skipped as the BDFR will see files by the same name and skip the download, assuming that they are already downloaded. +It is highly recommended that the file name scheme contain the parameter `{POSTID}` as this is **the only parameter +guaranteed to be unique**. No combination of other keys will necessarily be unique and may result in posts being skipped +as the BDFR will see files by the same name and skip the download, assuming that they are already downloaded. 
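+For instance, a minimal sketch of a file scheme that combines a static string with the guaranteed-unique `{POSTID}` key
+(the subreddit and output path here mirror the quick-reference commands above and are purely illustrative):
+
+```bash
+# Illustrative only: prefix every downloaded file with its subreddit and its unique post ID
+bdfr download ./path/to/output --subreddit Python --file-scheme "{SUBREDDIT}_{POSTID}"
+```
+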
## Configuration -The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be: +The configuration files are, by default, stored in the configuration directory for the user. This differs depending on +the OS that the BDFR is being run on. For Windows, this will be: - `C:\Users\<username>\AppData\Local\BDFR\bdfr` -If Python has been installed through the Windows Store, the folder will appear in a different place. Note that the hash included in the file path may change from installation to installation. +If Python has been installed through the Windows Store, the folder will appear in a different place. Note that the hash +included in the file path may change from installation to installation. - `C:\Users\<username>\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\Local\BDFR\bdfr` @@ -369,11 +422,13 @@ Lastly, on a Linux system, this will be: - `~/.config/bdfr/` -The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report. +The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to +submit a bug, it is this file that you will need to submit with the report. ### Configuration File -The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied. +The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys +**must** be included in the configuration file supplied. - `client_id` - `client_secret` - `scopes` @@ -387,21 +442,39 @@ The following keys are optional, and defaults will be used if they cannot be fou - `backup_log_count` - `max_wait_time` - `time_format` - `disabled_modules` - `filename-restriction-scheme` -All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default. +All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to +function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the +configuration directory as the default. -Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count` however has to do with the log rollover. The logs in the configuration directory can be verbose and for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs past this will overwrite the oldest log file, called "rolling over". If you want more records of past runs, increase this number. +Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count` however has to +do with the log rollover. The logs in the configuration directory can be verbose and for long runs of the BDFR, can grow +quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs +will be kept. 
The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any +runs past this will overwrite the oldest log file, called "rolling over". If you want more records of past runs, +increase this number. #### Time Formatting Customisation -The option `time_format` will specify the format of the timestamp that replaces `{DATE}` in filename and folder name schemes. By default, this is the [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format which is highly recommended due to its standardised nature. If you don't **need** to change it, it is recommended that you do not. However, you can specify it to anything required with this option. The `--time-format` option supersedes any specification in the configuration file +The option `time_format` will specify the format of the timestamp that replaces `{DATE}` in filename and folder name +schemes. By default, this is the [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format which is highly recommended +due to its standardised nature. If you don't **need** to change it, it is recommended that you do not. However, you can +specify it to anything required with this option. The `--time-format` option supersedes any specification in the +configuration file -The format can be specified through the [format codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python `datetime` library. +The format can be specified through the [format +codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python +`datetime` library. #### Disabling Modules -The individual modules of the BDFR, used to download submissions from websites, can be disabled. This is helpful especially in the case of the fallback downloaders, since the `--skip-domain` option cannot be effectively used in these cases. For example, the Youtube-DL downloader can retrieve data from hundreds of websites and domains; thus the only way to fully disable it is via the `--disable-module` option. +The individual modules of the BDFR, used to download submissions from websites, can be disabled. This is helpful +especially in the case of the fallback downloaders, since the `--skip-domain` option cannot be effectively used in these +cases. For example, the Youtube-DL downloader can retrieve data from hundreds of websites and domains; thus the only way +to fully disable it is via the `--disable-module` option. -Modules can be disabled through the command line interface for the BDFR or more permanently in the configuration file via the `disabled_modules` option. The list of downloaders that can be disabled are the following. Note that they are case-insensitive. +Modules can be disabled through the command line interface for the BDFR or more permanently in the configuration file +via the `disabled_modules` option. The list of downloaders that can be disabled are the following. Note that they are +case-insensitive. - `Direct` - `DelayForReddit` @@ -419,37 +492,75 @@ Modules can be disabled through the command line interface for the BDFR or more ### Rate Limiting -The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. 
This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases. -To this end, the BDFR will sleep for a time before retrying the download, giving the remote server time to "rest". This is done in 60 second increments. For example, if a rate-limiting-related error is given, the BDFR will sleep for 60 seconds before retrying. Then, if the same type of error occurs, it will sleep for another 120 seconds, then 180 seconds, and so on. -The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will wait for 60 seconds, then 120 seconds, and then move one. **Note that this results in a total time of 180 seconds trying the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum wait time may help. However, note that the actual wait times increase exponentially if the resource is not downloaded i.e. specifying a max value of 300 (5 minutes), can make the BDFR pause for 15 minutes on one submission, not 5, in the worst case. +The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of +requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so +many requests that the remote website cuts the client off to preserve the function of the site. This is a common +situation when downloading many resources from the same site. It is polite and best practice to obey the website's +wishes in these cases. +To this end, the BDFR will sleep for a time before retrying the download, giving the remote server time to "rest". This +is done in 60 second increments. For example, if a rate-limiting-related error is given, the BDFR will sleep for 60 +seconds before retrying. Then, if the same type of error occurs, it will sleep for another 120 seconds, then 180 +seconds, and so on. +The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will +wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will +wait for 60 seconds, then 120 seconds, and then move on. **Note that this results in a total time of 180 seconds trying +the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum +wait time may help. However, note that the actual wait times increase exponentially if the resource is not downloaded +i.e. specifying a max value of 300 (5 minutes), can make the BDFR pause for 15 minutes on one submission, not 5, in the +worst case. ## Multiple Instances -The BDFR can be run in multiple instances with multiple configurations, either concurrently or consecutively. 
The use of +scripting files facilitates this the easiest, either Powershell on Windows operating systems or Bash elsewhere. This +allows multiple scenarios to be run with data being scraped from different sources, as any two sets of scenarios might +be mutually exclusive i.e. it is not possible to download any combination of data from a single run of the BDFR. To +download from multiple users for example, multiple runs of the BDFR are required. -Running these scenarios consecutively is done easily, like any single run. Configuration files that differ may be specified with the `--config` option to switch between tokens, for example. Otherwise, almost all configuration for data sources can be specified per-run through the command line. +Running these scenarios consecutively is done easily, like any single run. Configuration files that differ may be +specified with the `--config` option to switch between tokens, for example. Otherwise, almost all configuration for data +sources can be specified per-run through the command line. -Running scenarios concurrently (at the same time) however, is more complicated. The BDFR will look to a single, static place to put the detailed log files, in a directory with the configuration file specified above. If there are multiple instances, or processes, of the BDFR running at the same time, they will all be trying to write to a single file. On Linux and other UNIX based operating systems, this will succeed, though there is a substantial risk that the logfile will be useless due to garbled and jumbled data. On Windows however, attempting this will raise an error that crashes the program as Windows forbids multiple processes from accessing the same file. +Running scenarios concurrently (at the same time) however, is more complicated. The BDFR will look to a single, static +place to put the detailed log files, in a directory with the configuration file specified above. If there are multiple +instances, or processes, of the BDFR running at the same time, they will all be trying to write to a single file. On +Linux and other UNIX based operating systems, this will succeed, though there is a substantial risk that the logfile +will be useless due to garbled and jumbled data. On Windows however, attempting this will raise an error that crashes +the program as Windows forbids multiple processes from accessing the same file. -The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine. +The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given +location is unique to each instance of the BDFR, then it will run fine. ## Filesystem Restrictions -Different filesystems have different restrictions for what files and directories can be named. Thesse are separated into two broad categories: Linux-based filesystems, which have very few restrictions; and Windows-based filesystems, which are much more restrictive in terms if forbidden characters and length of paths. +Different filesystems have different restrictions for what files and directories can be named. These are separated into +two broad categories: Linux-based filesystems, which have very few restrictions; and Windows-based filesystems, which +are much more restrictive in terms of forbidden characters and length of paths. 
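+As a quick sketch of the manual override explained in the next paragraphs, the stricter Windows naming rules can be
+forced from the command line (the subreddit and output path here are purely illustrative):
+
+```bash
+# Illustrative only: force Windows-style filename restrictions regardless of the detected filesystem
+bdfr download ./path/to/output --subreddit Python --filename-restriction-scheme windows
+```
+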
-During the normal course of operation, the BDFR detects what filesystem it is running on and formats any filenames and directories to conform to the rules that are expected of it. However, there are cases where this will fail. When running on a Linux-based machine, or another system where the home filesystem is permissive, and accessing a share or drive with a less permissive system, the BDFR will assume that the *home* filesystem's rules apply. For example, when downloading to a SAMBA share from Ubuntu, there will be errors as SAMBA is more restrictive than Ubuntu. +During the normal course of operation, the BDFR detects what filesystem it is running on and formats any filenames and +directories to conform to the rules that are expected of it. However, there are cases where this will fail. When running +on a Linux-based machine, or another system where the home filesystem is permissive, and accessing a share or drive with +a less permissive system, the BDFR will assume that the *home* filesystem's rules apply. For example, when downloading +to a SAMBA share from Ubuntu, there will be errors as SAMBA is more restrictive than Ubuntu. -The best option would be to always download to a filesystem that is as permission as possible, such as an NFS share or ext4 drive. However, when this is not possible, the BDFR allows for the restriction scheme to be manually specified at either the command-line or in the configuration file. At the command-line, this is done with `--filename-restriction-scheme windows`, or else an option by the same name in the configuration file. +The best option would be to always download to a filesystem that is as permissive as possible, such as an NFS share or +ext4 drive. However, when this is not possible, the BDFR allows for the restriction scheme to be manually specified at +either the command-line or in the configuration file. At the command-line, this is done with +`--filename-restriction-scheme windows`, or else an option by the same name in the configuration file. ## Manipulating Logfiles -The logfiles that the BDFR outputs are consistent and quite detailed and in a format that is amenable to regex. To this end, a number of bash scripts have been [included here](./scripts). They show examples for how to extract successfully downloaded IDs, failed IDs, and more besides. +The logfiles that the BDFR outputs are consistent and quite detailed and in a format that is amenable to regex. To this +end, a number of bash scripts have been [included here](./scripts). They show examples for how to extract successfully +downloaded IDs, failed IDs, and more besides. ## Unsaving posts -Back in v1 there was an option to unsave posts from your account when downloading, but it was removed from the core BDFR on v2 as it is considered a read-only tool. However, for those missing this functionality, a script was created that uses the log files to achieve this. There is info on how to use this on the README.md file on the scripts subdirectory. +Back in v1 there was an option to unsave posts from your account when downloading, but it was removed from the core BDFR +on v2 as it is considered a read-only tool. However, for those missing this functionality, a script was created that +uses the log files to achieve this. There is info on how to use this in the README.md file in the scripts subdirectory. 
## List of currently supported sources

@@ -465,10 +576,12 @@ Back in v1 there was an option to unsave posts from your account when downloadin
- Redgifs
- Vidble
- YouTube
-  - Any source supported by [YT-DLP](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md) should be compatable
+  - Any source supported by [YT-DLP](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md) should be
+    compatible

## Contributing

If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information.

-When reporting any issues or interacting with the developers, please follow the [Code of Conduct](docs/CODE_OF_CONDUCT.md).
+When reporting any issues or interacting with the developers, please follow the [Code of
+Conduct](docs/CODE_OF_CONDUCT.md).

From 86101a27b26371f717d1a5300ddc3cf90726d5b5 Mon Sep 17 00:00:00 2001
From: Serene-Arc
Date: Sun, 25 Jun 2023 14:26:12 +1000
Subject: [PATCH 70/82] Format documents according to line length

---
 docs/ARCHITECTURE.md    |  88 ++++++++++++++++++++++++++--------
 docs/CODE_OF_CONDUCT.md |  65 ++++++++++---------------
 docs/CONTRIBUTING.md    | 102 +++++++++++++++++++++++++-----------
 3 files changed, 168 insertions(+), 87 deletions(-)

diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 8fc4e139..d7aa99ea 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1,41 +1,89 @@
# Architecture

-When the project was rewritten for v2, the goal was to make the codebase easily extensible and much easier to read and modify. However, this document provides a step-by-step look through the process that the BDFR goes through, so that any prospective developers can more easily grasp the way the code works.
+When the project was rewritten for v2, the goal was to make the codebase easily
+extensible and much easier to read and modify. However, this document provides
+a step-by-step look at the process that the BDFR goes through, so that any
+prospective developers can more easily grasp the way the code works.

## Design Ethos

-The BDFR is designed to be a stateless downloader. This means that the state of the program is forgotten between each run of the program. There are no central lists, databases, or indices, that the BDFR uses, only the actual files on disk. There are several advantages to this approach:
+The BDFR is designed to be a stateless downloader. This means that the state of
+the program is forgotten between each run of the program. There are no central
+lists, databases, or indices that the BDFR uses, only the actual files on
+disk. There are several advantages to this approach:

-1. There is no chance of the database being corrupted or changed by something other than the BDFR, rendering the BDFR's "idea" of the archive wrong or incomplete.
-2. Any information about the archive is contained by the archive itself i.e. for a list of all submission IDs in the archive, this can be extracted from the names of the files in said archive, assuming an appropriate naming scheme was used.
-3. Archives can be merged, split, or editing without worrying about having to update a central database
-4. There are no versioning issues between updates of the BDFR, where old version are stuck with a worse form of the database
-5. An archive can be put on a USB, moved to another computer with possibly a very different BDFR version, and work completely fine
+1. There is no chance of the database being corrupted or changed by something
+   other than the BDFR, rendering the BDFR's "idea" of the archive wrong or
+   incomplete.
+2. 
Any information about the archive is contained by the archive itself i.e.
+   for a list of all submission IDs in the archive, this can be extracted from
+   the names of the files in said archive, assuming an appropriate naming
+   scheme was used.
+3. Archives can be merged, split, or edited without worrying about having to
+   update a central database.
+4. There are no versioning issues between updates of the BDFR, where old
+   versions are stuck with a worse form of the database.
+5. An archive can be put on a USB, moved to another computer with possibly
+   a very different BDFR version, and work completely fine.

-Another major part of the ethos of the design is DOTADIW, Do One Thing And Do It Well. It's a major part of Unix philosophy and states that each tool should have a well-defined, limited purpose. To this end, the BDFR is, as the name implies, a *downloader*. That is the scope of the tool. Managing the files downloaded can be for better-suited programs, since the BDFR is not a file manager. Nor the BDFR concern itself with how any of the data downloaded is displayed, changed, parsed, or analysed. This makes the BDFR suitable for data science-related tasks, archiving, personal downloads, or analysis of various Reddit sources as the BDFR is completely agnostic on how the data is used.
+Another major part of the ethos of the design is DOTADIW, Do One Thing And Do
+It Well. It's a major part of Unix philosophy and states that each tool should
+have a well-defined, limited purpose. To this end, the BDFR is, as the name
+implies, a *downloader*. That is the scope of the tool. Managing the downloaded
+files can be left to better-suited programs, since the BDFR is not a file
+manager. Nor does the BDFR concern itself with how any of the data downloaded is
+displayed, changed, parsed, or analysed. This makes the BDFR suitable for data
+science-related tasks, archiving, personal downloads, or analysis of various
+Reddit sources, as the BDFR is completely agnostic on how the data is used.

## The Download Process

-The BDFR is organised around a central object, the RedditDownloader class. The Archiver object extends and inherits from this class.
+The BDFR is organised around a central object, the RedditDownloader class. The
+Archiver object extends and inherits from this class.

-1. The RedditDownloader parses all the arguments and configuration options, held in the Configuration object, and creates a variety of internal objects for use, such as the file name formatter, download filter, etc.
-2. The RedditDownloader scrapes raw submissions from Reddit via several methods relating to different sources. A source is defined as a single stream of submissions from a subreddit, multireddit, or user list.
-3. These raw submissions are passed to the DownloaderFactory class to select the specialised downloader class to use. Each of these are for a specific website or link type, with some catch-all classes like Direct.
-4. The BaseDownloader child, spawned by DownloaderFactory, takes the link and does any necessary processing to find the direct link to the actual resource.
-5. This is returned to the RedditDownloader in the form of a Resource object. This holds the URL and some other information for the final resource.
+1. The RedditDownloader parses all the arguments and configuration options,
+   held in the Configuration object, and creates a variety of internal objects
+   for use, such as the file name formatter, download filter, etc.
+2. 
The RedditDownloader scrapes raw submissions from Reddit via several methods
+   relating to different sources. A source is defined as a single stream of
+   submissions from a subreddit, multireddit, or user list.
+3. These raw submissions are passed to the DownloadFactory class to select
+   the specialised downloader class to use. Each of these is for a specific
+   website or link type, with some catch-all classes like Direct.
+4. The BaseDownloader child, spawned by the DownloadFactory, takes the link and
+   does any necessary processing to find the direct link to the actual
+   resource.
+5. This is returned to the RedditDownloader in the form of a Resource object.
+   This holds the URL and some other information for the final resource.
 6. The Resource is passed through the DownloadFilter instantiated in step 1.
-7. The destination file name for the Resource is calculated. If it already exists, then the Resource will be discarded.
-8. Here the actual data is downloaded to the Resource and a hash calculated which is used to find duplicates.
+7. The destination file name for the Resource is calculated. If it already
+   exists, then the Resource will be discarded.
+8. Here the actual data is downloaded to the Resource and a hash is calculated
+   which is used to find duplicates.
 9. Only then is the Resource written to the disk.

-This is the step-by-step process that the BDFR goes through to download a Reddit post.
+This is the step-by-step process that the BDFR goes through to download
+a Reddit post.

## Adding another Supported Site

-This is one of the easiest changes to do with the code. First, any new class must inherit from the BaseDownloader class which provided an abstract parent to implement. However, take note of the other classes as well. Many downloaders can inherit from one another instead of just the BaseDownloader. For example, the VReddit class, used for downloading video from Reddit, inherits almost all of its code from the YouTube class. **Minimise code duplication wherever possible**.
+This is one of the easiest changes to make to the code. First, any new class
+must inherit from the BaseDownloader class, which provides an abstract parent to
+implement. However, take note of the other classes as well. Many downloaders
+can inherit from one another instead of just the BaseDownloader. For example,
+the VReddit class, used for downloading video from Reddit, inherits almost all
+of its code from the YouTube class. **Minimise code duplication wherever
+possible**.

-Once the downloader class has been written **and tests added** for it as well, then the regex string for the site's URLs can be added to the DownloaderFactory. Then additional tests must be added for the DownloadFactory to ensure that the appropriate classes are called when the right URLs are passed to the factory.
+Once the downloader class has been written **and tests added** for it as well,
+then the regex string for the site's URLs can be added to the
+DownloadFactory. Then additional tests must be added for the DownloadFactory
+to ensure that the appropriate classes are called when the right URLs are
+passed to the factory.

## Adding Other Features

-For a fundamentally different form of execution path for the program, such as the difference between the `archive` and `download` commands, it is best to inherit from the RedditDownloader class and override or add functionality as needed. 
+For a fundamentally different form of execution path for the program, such as +the difference between the `archive` and `download` commands, it is best to +inherit from the RedditDownloader class and override or add functionality as +needed. diff --git a/docs/CODE_OF_CONDUCT.md b/docs/CODE_OF_CONDUCT.md index fe0374d8..70e7e375 100644 --- a/docs/CODE_OF_CONDUCT.md +++ b/docs/CODE_OF_CONDUCT.md @@ -2,17 +2,14 @@ ## Our Pledge -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to making participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, gender identity and expression, level of experience, -education, socio-economic status, nationality, personal appearance, race, -religion, or sexual identity and orientation. +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making +participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, +disability, ethnicity, gender identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards -Examples of behavior that contributes to creating a positive environment -include: +Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences @@ -22,53 +19,41 @@ include: Examples of unacceptable behavior by participants include: -* The use of sexualized language or imagery and unwelcome sexual attention or - advances +* The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take +appropriate and fair corrective action in response to any instances of unacceptable behavior. -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, +issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any +contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
## Scope

-This Code of Conduct applies both within project spaces and in public spaces
-when an individual is representing the project or its community. Examples of
-representing a project or community include using an official project e-mail
-address, posting via an official social media account, or acting as an appointed
-representative at an online or offline event. Representation of a project may be
-further defined and clarified by project maintainers.
+This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the
+project or its community. Examples of representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed representative at an online or offline
+event. Representation of a project may be further defined and clarified by project maintainers.

## Enforcement

-Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported by contacting the project team via Discord. All complaints will
-be reviewed and investigated and will result in a response that is deemed
-necessary and appropriate to the circumstances. The project team is
-obligated to maintain confidentiality with regard to the reporter of an
-incident. Further details of specific enforcement policies may be posted
-separately.
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team via
+Discord. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and
+appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter
+of an incident. Further details of specific enforcement policies may be posted separately.

-Project maintainers who do not follow or enforce the Code of Conduct in good
-faith may face temporary or permanent repercussions as determined by other
-members of the project's leadership.
+Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent
+repercussions as determined by other members of the project's leadership.

## Attribution

-This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
-available at
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at
+<https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>

[homepage]: https://www.contributor-covenant.org

diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
index 11688631..88459362 100644
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -1,32 +1,52 @@
# Contributing

-When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on it. This helps create a trail on GitHub and keeps things organised.
+When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on
+it. This helps create a trail on GitHub and keeps things organised.

-**Please don't open an issue on GitHub** unless you are reporting a bug or proposing a feature. For questions, there is a discussion tab on the repository's GitHub page where you can interact with the developers and ask questions. If you believe that something is a bug, or that a feature should be added, then by all means open an issue.
+**Please don't open an issue on GitHub** unless you are reporting a bug or proposing a feature. 
For questions, there is
+a discussion tab on the repository's GitHub page where you can interact with the developers and ask questions. If you
+believe that something is a bug, or that a feature should be added, then by all means open an issue.

-All communication on GitHub, Discord, email, or any other medium must conform to the [Code of Conduct](CODE_OF_CONDUCT.md). It's not that hard to stay respectful.
+All communication on GitHub, Discord, email, or any other medium must conform to the [Code of
+Conduct](CODE_OF_CONDUCT.md). It's not that hard to stay respectful.

## Opening an Issue

-**Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists, try to contribute to the issue.
+**Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists,
+try to contribute to the issue.

-**If you are asking a question** about the functioning of the BDFR or the interface, please use the discussions page. Bug reports are not the right medium for asking and answering questions, and the discussions page makes it much easier to discuss, answer, and save questions and responses for others going forwards.
+**If you are asking a question** about the functioning of the BDFR or the interface, please use the discussions page.
+Bug reports are not the right medium for asking and answering questions, and the discussions page makes it much easier
+to discuss, answer, and save questions and responses for others going forward.

### Bugs

-When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
+When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log
+file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where
+this is. This log file will contain all the information required for the developers to recreate the bug.

-If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be asked in the thread of the bug.
+If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or
+comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that
+will also provide helpful information when trying to find and fix the bug. If needed, more information will be requested
+in the thread of the bug.

-Adding this information is **not optional**. If a bug report is opened without this information, it cannot be replicated by developers. The logs will be asked for once and if they are not supplied, the issue will be closed due to lack of information.
+Adding this information is **not optional**. If a bug report is opened without this information, the bug cannot be
+replicated by the developers. The logs will be asked for once, and if they are not supplied, the issue will be closed
+due to lack of information.
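+As a concrete illustration (the subreddit and paths below are placeholders, and the flags are the standard ones
+documented in the README), a run like the following reproduces a problem while writing a logfile that can be attached
+to the issue alongside the command itself:
+
+```bash
+# Re-run the failing command with verbose logging and an explicit
+# logfile location, then attach bug_log.txt to the GitHub issue.
+bdfr download ./output --subreddit example_subreddit -L 10 --verbose --log ./bug_log.txt
+```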
### Feature requests

-In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has a very small chance to get accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
+In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear about what
+you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It
+is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has
+a very small chance of being accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no
+reason, at the discretion of the developers.

## Pull Requests

-Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) section below before actually writing any code.
+Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that
+the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) section below before actually
+writing any code.

Once you have done both of these, the below list shows the path that should be followed when writing a PR.

@@ -38,13 +58,15 @@ Once you have done both of these, the below list shows the path that should be f
 6. Open a pull request that references the relevant issue.
 7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.

-Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
+Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this
+won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.

## Preparing the environment for development

Bulk Downloader for Reddit requires Python 3.9 at minimum. First, ensure that your Python installation satisfies this.

-BDfR is built in a way that it can be packaged and installed via `pip`. This places BDfR next to other Python packages and enables you to run the program from any directory. Since it is managed by pip, you can also uninstall it.
+The BDFR is built in a way that it can be packaged and installed via `pip`. This places the BDFR next to other Python
+packages and enables you to run the program from any directory. Since it is managed by pip, you can also uninstall it.

To install the program, clone the repository and run pip inside the project's root directory:

@@ -54,7 +76,9 @@
cd ./bulk-downloader-for-reddit
python3 -m pip install -e .
```

-**`-e`** parameter creates a link to that folder. That is, any change inside the folder affects the package immidiately. So, when developing, you can be sure that the package is not stale and Python is always running your latest changes. (Due to this linking, moving/removing/renaming the folder might break it)
+The **`-e`** parameter creates a link to that folder. 
That is, any change inside the folder affects the package immediately.
+So, when developing, you can be sure that the package is not stale and Python is always running your latest changes.
+(Due to this linking, moving/removing/renaming the folder might break it.)

Then, you can run the program from anywhere in your disk as such:

@@ -62,7 +86,8 @@ Then, you can run the program from anywhere in your disk as such:
```bash
bdfr
```

-There are additional Python packages that are required to develop the BDFR. These can be installed with the following command:
+There are additional Python packages that are required to develop the BDFR. These can be installed with the following
+command:

```bash
python3 -m pip install -e .[dev]
```

@@ -78,30 +103,40 @@ The BDFR project uses several tools to manage the code of the project. These inc
 - [tox](https://tox.wiki/en/latest/)
 - [pre-commit](https://github.com/pre-commit/pre-commit)

-The first three tools are formatters. These change the code to the standards expected for the BDFR project. The configuration details for these tools are contained in the [pyproject.toml](../pyproject.toml) file for the project.
+The first three tools are formatters. These change the code to the standards expected for the BDFR project. The
+configuration details for these tools are contained in the [pyproject.toml](../pyproject.toml) file for the project.

The tool `tox` is used to run tests and tools on demand and has the following environments:

- `format`
- `format_check`

-The tool `pre-commit` is optional, and runs the three formatting tools automatically when a commit is made. This is **highly recommended** to ensure that all code submitted for this project is formatted acceptably. Note that any PR that does not follow the formatting guide will not be accepted. For information on how to use pre-commit to avoid this, see [the pre-commit documentation](https://pre-commit.com/).
+The tool `pre-commit` is optional, and runs the three formatting tools automatically when a commit is made. This is
+**highly recommended** to ensure that all code submitted for this project is formatted acceptably. Note that any PR that
+does not follow the formatting guide will not be accepted. For information on how to use pre-commit to avoid this, see
+[the pre-commit documentation](https://pre-commit.com/).

## Style Guide

-The BDFR uses the Black formatting standard and enforces this with the tool by the same name. Additionally, the tool isort is used as well to format imports.
+The BDFR uses the Black formatting standard and enforces this with the tool of the same name. Additionally, the tool
+isort is used to format imports.

-See [Preparing the Environment for Development](#preparing-the-environment-for-development) for how to setup these tools to run automatically.
+See [Preparing the Environment for Development](#preparing-the-environment-for-development) for how to set up these
+tools to run automatically.

## Tests

### Running Tests

-There are a lot of tests in the BDFR. In fact, there are more tests than lines of functional code. This is one of the strengths of the BDFR in that it is fully tested. The codebase uses the package pytest to create the tests, which is a third-party package that provides many functions and objects useful for testing Python code.
+There are a lot of tests in the BDFR. In fact, there are more tests than lines of functional code. This is one of the
+strengths of the BDFR: it is fully tested. 
The codebase uses the package pytest to create the tests, which is
+a third-party package that provides many functions and objects useful for testing Python code.

-When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken anything. Otherwise, while writing the request, it can be helpful (and much quicker) to run only a subset of the tests.
+When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken
+anything. Otherwise, while writing the PR, it can be helpful (and much quicker) to run only a subset of the tests.

-This is accomplished with marks, a system that pytest uses to categorise tests. There are currently the current marks in use in the BDFR test suite.
+This is accomplished with marks, a system that pytest uses to categorise tests. These are the marks currently in
+use in the BDFR test suite.

 - `slow`
   - This marks a test that may take a long time to complete
@@ -113,7 +148,9 @@ This is accomplished with marks, a system that pytest uses to categorise tests.
 - `authenticated`
   - This marks a test that requires a test configuration file with a valid OAuth2 token

-These tests can be run either all at once, or excluding certain marks. The tests that require online resources, such as those marked `reddit` or `online`, will naturally require more time to run than tests that are entirely offline. To run tests, you must be in the root directory of the project and can use the following command.
+These tests can be run either all at once, or excluding certain marks. The tests that require online resources, such as
+those marked `reddit` or `online`, will naturally require more time to run than tests that are entirely offline. To run
+tests, you must be in the root directory of the project and can use the following command.

```bash
pytest
```

@@ -128,18 +165,29 @@
pytest -m "not reddit and not authenticated"
```

### Configuration for authenticated tests

-There should be configuration file `test_config.cfg` in the project's root directory to be able to run the integration tests with reddit authentication. See how to create such files [here](../README.md#configuration). The easiest way of creating this file is copying your existing `default_config.cfg` file from the path stated in the previous link and renaming it to `test_config.cfg` Be sure that user_token key exists in test_config.cfg.
+There should be a configuration file `test_config.cfg` in the project's root directory in order to run the integration
+tests with Reddit authentication. See how to create such files [here](../README.md#configuration). The easiest way of
+creating this file is copying your existing `default_config.cfg` file from the path stated in the previous link and
+renaming it to `test_config.cfg`. Be sure that the `user_token` key exists in `test_config.cfg`.

---

For more details, review the pytest documentation that is freely available online.

-Many IDEs also provide integrated functionality to run and display the results from tests, and almost all of them support pytest in some capacity. This would be the recommended method due to the additional debugging and general capabilities.
+Many IDEs also provide integrated functionality to run and display the results from tests, and almost all of them
+support pytest in some capacity. This would be the recommended method due to the additional debugging and general
+capabilities.

### Writing Tests

-When writing tests, ensure that they follow the style guide. 
The BDFR uses pytest to run tests. Wherever possible, parameterise tests, even if you only have one test case. This makes it easier to expand in the future, as the ultimate goal is to have multiple test cases for every test, instead of just one.
+When writing tests, ensure that they follow the style guide. The BDFR uses pytest to run tests. Wherever possible,
+parameterise tests, even if you only have one test case. This makes it easier to expand in the future, as the ultimate
+goal is to have multiple test cases for every test, instead of just one.

-If required, use of mocks is expected to simplify tests and reduce the resources or complexity required. Tests should be as small as possible and test as small a part of the code as possible. Comprehensive or integration tests are run with the `click` framework and are located in their own file.
+If required, mocks are expected to be used to simplify tests and reduce the resources or complexity involved. Tests
+should be as small as possible and test as small a part of the code as possible. Comprehensive or integration tests are
+run with the `click` framework and are located in their own file.

-It is also expected that new tests be classified correctly with the marks described above i.e. if a test accesses Reddit through a `reddit_instance` object, it must be given the `reddit` mark. If it requires an authenticated Reddit instance, then it must have the `authenticated` mark.
+It is also expected that new tests be classified correctly with the marks described above i.e. if a test accesses Reddit
+through a `reddit_instance` object, it must be given the `reddit` mark. If it requires an authenticated Reddit instance,
+then it must have the `authenticated` mark.

From 0adf326d588e7c470f6bd40c890215323d563563 Mon Sep 17 00:00:00 2001
From: Soulsuck24 <79275800+Soulsuck24@users.noreply.github.com>
Date: Mon, 26 Jun 2023 16:00:09 -0400
Subject: [PATCH 71/82] Chevereto downloader

rename nsfw.pics to chevereto as it is the backend being used for lensdump as well.
--- .../{nsfw_pics.py => chevereto.py} | 15 +++--- bdfr/site_downloaders/download_factory.py | 6 +-- .../{test_nsfw_pics.py => test_chevereto.py} | 46 +++++++++++++++++-- 3 files changed, 52 insertions(+), 15 deletions(-) rename bdfr/site_downloaders/{nsfw_pics.py => chevereto.py} (71%) rename tests/site_downloaders/{test_nsfw_pics.py => test_chevereto.py} (60%) diff --git a/bdfr/site_downloaders/nsfw_pics.py b/bdfr/site_downloaders/chevereto.py similarity index 71% rename from bdfr/site_downloaders/nsfw_pics.py rename to bdfr/site_downloaders/chevereto.py index d6221c5d..91ddc2fc 100644 --- a/bdfr/site_downloaders/nsfw_pics.py +++ b/bdfr/site_downloaders/chevereto.py @@ -12,21 +12,20 @@ logger = logging.getLogger(__name__) -class NsfwPics(BaseDownloader): +class Chevereto(BaseDownloader): def __init__(self, post: Submission) -> None: super().__init__(post) def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: links = self._get_links(self.post.url) if not links: - raise SiteDownloaderError("nsfw.pics parser could not find any links") - links = [Resource(self.post, link, Resource.retry_download(link)) for link in links] - return links + raise SiteDownloaderError("Chevereto parser could not find any links") + return [Resource(self.post, link, Resource.retry_download(link)) for link in links] @staticmethod def _get_album_links(url: str) -> list: image_pages = [] - album = NsfwPics.retrieve_url(f"{url}") + album = Chevereto.retrieve_url(f"{url}") soup = bs4.BeautifulSoup(album.text, "html.parser") album_divs = soup.find("div", attrs={"class": "pad-content-listing"}) links = album_divs.find_all("div", {"data-type": "image"}) @@ -37,10 +36,10 @@ def _get_album_links(url: str) -> list: @staticmethod def _get_links(url: str) -> set[str]: resources = [] - urls = NsfwPics._get_album_links(url) if "/album/" in url else [url] + urls = Chevereto._get_album_links(url) if "/album/" in url or "/a/" in url else [url] for url in urls: - page = NsfwPics.retrieve_url(url) + page = Chevereto.retrieve_url(url) soup = bs4.BeautifulSoup(page.text, "html.parser") - image_link = soup.find("input", attrs={"id": "embed-code-2"}).get("value") + image_link = soup.find("a", attrs={"data-action": lambda x: x and x.lower() == "download"}).get("href") resources.append(image_link) return set(resources) diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 194a3ec2..971096e0 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -6,6 +6,7 @@ from bdfr.exceptions import NotADownloadableLinkError from bdfr.site_downloaders.base_downloader import BaseDownloader from bdfr.site_downloaders.catbox import Catbox +from bdfr.site_downloaders.chevereto import Chevereto from bdfr.site_downloaders.delay_for_reddit import DelayForReddit from bdfr.site_downloaders.direct import Direct from bdfr.site_downloaders.erome import Erome @@ -15,7 +16,6 @@ from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgchest import Imgchest from bdfr.site_downloaders.imgur import Imgur -from bdfr.site_downloaders.nsfw_pics import NsfwPics from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs from bdfr.site_downloaders.self_post import SelfPost @@ -42,6 +42,8 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Erome elif re.match(r"catbox\.moe", sanitised_url): return Catbox + elif re.match(r"lensdump\.com", sanitised_url) or 
re.match(r"nsfw\.pics", sanitised_url): + return Chevereto elif re.match(r"delayforreddit\.com", sanitised_url): return DelayForReddit elif re.match(r"flickr\.com", sanitised_url) or re.match(r"flic\.kr", sanitised_url): @@ -52,8 +54,6 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Gallery elif re.match(r"imgchest\.com/p/", sanitised_url): return Imgchest - elif re.match(r"nsfw\.pics", sanitised_url): - return NsfwPics elif re.match(r"reddit\.com/r/", sanitised_url): return SelfPost elif re.match(r"(m\.)?youtu\.?be", sanitised_url): diff --git a/tests/site_downloaders/test_nsfw_pics.py b/tests/site_downloaders/test_chevereto.py similarity index 60% rename from tests/site_downloaders/test_nsfw_pics.py rename to tests/site_downloaders/test_chevereto.py index 7516807b..43fc0ab8 100644 --- a/tests/site_downloaders/test_nsfw_pics.py +++ b/tests/site_downloaders/test_chevereto.py @@ -3,7 +3,7 @@ import pytest from bdfr.resource import Resource -from bdfr.site_downloaders.nsfw_pics import NsfwPics +from bdfr.site_downloaders.chevereto import Chevereto @pytest.mark.online @@ -20,10 +20,20 @@ "https://nsfw.pics/image/O6du", }, ), + ( + "https://lensdump.com/a/Vb411", # Album + { + "https://lensdump.com/i/CDIUci", + "https://lensdump.com/i/CDIXZo", + "https://lensdump.com/i/CDIwD2", + "https://lensdump.com/i/CDI5VC", + "https://lensdump.com/i/CDIGn5", + }, + ), ), ) def test_get_album(test_url: str, expected: set[str]): - results = NsfwPics._get_album_links(test_url) + results = Chevereto._get_album_links(test_url) assert len(results) == len(expected) assert sorted(results) == sorted(expected) @@ -42,14 +52,28 @@ def test_get_album(test_url: str, expected: set[str]): "https://i.nsfw.pics/c447389dee315f5960eb29671fb56232.jpeg", }, ), + ( + "https://lensdump.com/a/Vb411", # Album + { + "https://i3.lensdump.com/i/CDIUci.gif?open=true", + "https://i.lensdump.com/i/CDIXZo.jpeg?open=true", + "https://i1.lensdump.com/i/CDIwD2.jpeg?open=true", + "https://i3.lensdump.com/i/CDI5VC.gif?open=true", + "https://i1.lensdump.com/i/CDIGn5.jpeg?open=true", + }, + ), ( "https://nsfw.pics/image/OdfV", # Single image {"https://i.nsfw.pics/b8007b506022132fe857eead3dc98a92.gif"}, ), + ( + "https://lensdump.com/i/CDIUci", # Single image + {"https://i3.lensdump.com/i/CDIUci.gif?open=true"}, + ), ), ) def test_get_links(test_url: str, expected: set[str]): - results = NsfwPics._get_links(test_url) + results = Chevereto._get_links(test_url) assert sorted(results) == sorted(expected) @@ -68,16 +92,30 @@ def test_get_links(test_url: str, expected: set[str]): "fb60e0a42a0f7f0929f5a5ae401a3518", }, ), + ( + "https://lensdump.com/a/Vb411", # Album + { + "9ceac1e26c4799b0a6b7d5453a73f53b", + "54391b5210286bd01224f1f513159e82", + "907f92b1c295d5f84f4f64aacc960079", + "14d911ebc49fb82e5657c8ac827a2b32", + "a66d093b4fe19a1cb4b5e10bc34d17bb", + }, + ), ( "https://nsfw.pics/image/OdfV", # Single image {"9ceac1e26c4799b0a6b7d5453a73f53b"}, ), + ( + "https://lensdump.com/i/CDIUci", # Single image + {"9ceac1e26c4799b0a6b7d5453a73f53b"}, + ), ), ) def test_download_resources(test_url: str, expected_hashes: set[str]): mock_download = Mock() mock_download.url = test_url - downloader = NsfwPics(mock_download) + downloader = Chevereto(mock_download) results = downloader.find_resources() assert all(isinstance(res, Resource) for res in results) [res.download() for res in results] From 29c734466daf9fdea59b6fc5d2e9741e6a60b7cd Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 8 Jul 2023 19:39:30 +1000 Subject: [PATCH 72/82] Bump 
version --- bdfr/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bdfr/__init__.py b/bdfr/__init__.py index 0576f7d5..38d65ba2 100644 --- a/bdfr/__init__.py +++ b/bdfr/__init__.py @@ -1,3 +1,3 @@ #!/usr/bin/env python3 -__version__ = "2.6.2" +__version__ = "2.7.0" From c1ed7e8b6ebb8e351dc9b5979197a7cbe18e51a7 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sun, 16 Jul 2023 13:50:57 +1000 Subject: [PATCH 73/82] Add warnings for default and no app id --- bdfr/connector.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bdfr/connector.py b/bdfr/connector.py index 1f2c334a..fd546df5 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -135,6 +135,12 @@ def create_reddit_instance(self) -> None: logger.debug("Using authenticated Reddit instance") client_id = self.cfg_parser.get("DEFAULT", "client_id") client_secret = self.cfg_parser.get("DEFAULT", "client_secret", fallback=None) + if client_id == "U-6gk4ZCh3IeNQ": + logger.warning( + "You are using the default app ID for the BDFR; this will result in you sharing a request quota" + " with every other user. It is recommended to create your own app and put the ID and secret" + " in the configuration file" + ) if client_secret and client_secret.lower() == "none": client_secret = None if not self.cfg_parser.has_option("DEFAULT", "user_token"): @@ -162,6 +168,9 @@ def create_reddit_instance(self) -> None: ) else: logger.debug("Using unauthenticated Reddit instance") + logger.warning( + "Using an unauthenticated app like this will result in Reddit limiting queries to 10 requests a minute" + ) self.authenticated = False client_secret = self.cfg_parser.get("DEFAULT", "client_secret", fallback=None) if client_secret and client_secret.lower() == "none": From 610bf239b97c848bea3569ec31ce1008896c3c77 Mon Sep 17 00:00:00 2001 From: Serene-Arc Date: Sat, 29 Jul 2023 14:25:46 +1000 Subject: [PATCH 74/82] Add ratelimit to praw object --- bdfr/connector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bdfr/connector.py b/bdfr/connector.py index fd546df5..ed07b8d4 100644 --- a/bdfr/connector.py +++ b/bdfr/connector.py @@ -165,6 +165,7 @@ def create_reddit_instance(self) -> None: client_secret=client_secret, user_agent=self.user_agent, token_manager=token_manager, + ratelimit_seconds=120, ) else: logger.debug("Using unauthenticated Reddit instance") @@ -179,6 +180,7 @@ def create_reddit_instance(self) -> None: client_id=self.cfg_parser.get("DEFAULT", "client_id"), client_secret=client_secret, user_agent=self.user_agent, + ratelimit_seconds=120, ) def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]: From b02eefd934c006f34de4077a530722c4490389a4 Mon Sep 17 00:00:00 2001 From: vladislav doster <10052309+vladdoster@users.noreply.github.com> Date: Sun, 1 Oct 2023 06:25:26 -0500 Subject: [PATCH 75/82] fix: f-strings in clone command exception --- bdfr/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bdfr/__main__.py b/bdfr/__main__.py index c263769c..9aa2f994 100644 --- a/bdfr/__main__.py +++ b/bdfr/__main__.py @@ -165,10 +165,10 @@ def cli_clone(context: click.Context, **_) -> None: reddit_scraper = RedditCloner(config, [stream]) reddit_scraper.download() except Exception: - logger.exception("Scraper exited unexpectedly - BDFR Scraper v{__version__}") + logger.exception(f"Scraper exited unexpectedly - BDFR Scraper v{__version__}") raise else: - logger.info("Program complete - BDFR Cloner v{__version__}") + logger.info(f"Program complete - BDFR Cloner v{__version__}") 
@cli.command("completions") From c6c9f2d94d41f63056d4adb367f7f14035fa745c Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Mon, 15 Jan 2024 15:15:15 +0100 Subject: [PATCH 76/82] fixed ruff instructions --- .github/workflows/test.yml | 4 ++-- pyproject.toml | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 80e21229..457d5cfa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -48,7 +48,7 @@ jobs: - name: Critical ruff lint run: | - ruff check --format=github --select=E9,F63,F7,F82 . + ruff check --select=E9,F63,F7,F82 . - name: Test with pytest run: | @@ -62,4 +62,4 @@ jobs: - name: Full ruff lint run: | - ruff check --format=github . --exit-zero + ruff check . --exit-zero diff --git a/pyproject.toml b/pyproject.toml index f39997e2..14780386 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,6 @@ exclude = ["scripts/tests"] external = ["FURB123"] flake8-annotations = {"allow-star-arg-any" = true, "suppress-dummy-args" = true} flake8-pytest-style = {"parametrize-values-type" = "tuple", "mark-parentheses" = false} -format = "grouped" ignore = ["ANN101","B904","N818","RET505"] line-length = 120 per-file-ignores={"tests/*"=["ANN","S101","S105","S106"], "scripts/*"=["INP","S105","S106"]} From ff2704974e3db01761c6075a50271668ea108d15 Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Mon, 15 Jan 2024 15:18:32 +0100 Subject: [PATCH 77/82] fix linted errors --- tests/integration_tests/test_download_integration.py | 2 -- tests/test_file_name_formatter.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration_tests/test_download_integration.py b/tests/integration_tests/test_download_integration.py index d14920f5..bedf7237 100644 --- a/tests/integration_tests/test_download_integration.py +++ b/tests/integration_tests/test_download_integration.py @@ -41,8 +41,6 @@ def create_basic_args_for_download_runner(test_args: list[str], run_path: Path): ( ["-s", "EmpireDidNothingWrong", "-L", 3], ["-s", "r/EmpireDidNothingWrong", "-L", 3], - ["-s", "r/EmpireDidNothingWrong", "-L", 3], - ["-s", "EmpireDidNothingWrong", "-L", 3], ["-s", "https://www.reddit.com/r/TrollXChromosomes/", "-L", 3], ["-s", "r/TrollXChromosomes/", "-L", 3], ["-s", "TrollXChromosomes/", "-L", 3], diff --git a/tests/test_file_name_formatter.py b/tests/test_file_name_formatter.py index daf6526d..fdacf405 100644 --- a/tests/test_file_name_formatter.py +++ b/tests/test_file_name_formatter.py @@ -531,6 +531,6 @@ def test_name_submission( def test_shortened_file_name_ending( test_filename: str, test_ending: str, expected_end: str, test_formatter: FileNameFormatter ): - result = test_formatter.limit_file_name_length(test_filename, test_ending, Path(".")) + result = test_formatter.limit_file_name_length(test_filename, test_ending, Path()) assert result.name.endswith(expected_end) assert len(str(result)) <= FileNameFormatter.find_max_path_length() From 9d2bda70f5580119f3ed399573ff2acdcd98fdea Mon Sep 17 00:00:00 2001 From: Piotr Migdal Date: Thu, 18 Jan 2024 18:00:24 +0100 Subject: [PATCH 78/82] bumped ruff to 0.1.13 --- .pre-commit-config.yaml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f97a54b..a3ff110c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: name: black - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.272 + rev: v0.1.13 hooks: - 
id: ruff name: ruff diff --git a/pyproject.toml b/pyproject.toml index 14780386..3b499d3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ dev = [ "black>=23.3.0", "pre-commit>=3.0.4", "pytest>=7.2.1", - "ruff>=0.0.272", + "ruff>=0.1.13", "tox>=3.27.1", ] From f198904eace5dbd6d9cb187edc670816a36b4694 Mon Sep 17 00:00:00 2001 From: Serene-Arc <33189705+Serene-Arc@users.noreply.github.com> Date: Sat, 24 Aug 2024 12:48:40 +1000 Subject: [PATCH 79/82] Remove gfycat since site is defunct --- bdfr/site_downloaders/download_factory.py | 3 - bdfr/site_downloaders/gfycat.py | 73 ------------------- .../site_downloaders/test_download_factory.py | 2 - tests/site_downloaders/test_gfycat.py | 59 --------------- 4 files changed, 137 deletions(-) delete mode 100644 bdfr/site_downloaders/gfycat.py delete mode 100644 tests/site_downloaders/test_gfycat.py diff --git a/bdfr/site_downloaders/download_factory.py b/bdfr/site_downloaders/download_factory.py index 971096e0..a32a60dd 100644 --- a/bdfr/site_downloaders/download_factory.py +++ b/bdfr/site_downloaders/download_factory.py @@ -13,7 +13,6 @@ from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback from bdfr.site_downloaders.flickr import Flickr from bdfr.site_downloaders.gallery import Gallery -from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgchest import Imgchest from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.pornhub import PornHub @@ -32,8 +31,6 @@ def pull_lever(url: str) -> type[BaseDownloader]: return Imgur elif re.match(r"(i\.|thumbs\d{1,2}\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url): return Redgifs - elif re.match(r"(thumbs\.|giant\.)?gfycat\.", sanitised_url): - return Gfycat elif re.match(r".*/.*\.[a-zA-Z34]{3,4}(\?[\w;&=]*)?$", sanitised_url) and not DownloadFactory.is_web_resource( sanitised_url ): diff --git a/bdfr/site_downloaders/gfycat.py b/bdfr/site_downloaders/gfycat.py deleted file mode 100644 index 02a1e7e1..00000000 --- a/bdfr/site_downloaders/gfycat.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -import json -import re -from typing import Optional - -from cachetools import TTLCache, cached -from praw.models import Submission - -from bdfr.exceptions import SiteDownloaderError -from bdfr.resource import Resource -from bdfr.site_authenticator import SiteAuthenticator -from bdfr.site_downloaders.redgifs import Redgifs - - -class Gfycat(Redgifs): - def __init__(self, post: Submission) -> None: - super().__init__(post) - - def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]: - return super().find_resources(authenticator) - - @staticmethod - @cached(cache=TTLCache(maxsize=5, ttl=3420)) - def _get_auth_token() -> str: - headers = { - "content-type": "text/plain;charset=UTF-8", - "host": "weblogin.gfycat.com", - "origin": "https://gfycat.com", - } - payload = {"access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2M95Rfa5FLLhPFucu8H5HTzeutyAa"} - token = json.loads( - Gfycat.post_url("https://weblogin.gfycat.com/oauth/webtoken", headers=headers, payload=payload).text - )["access_token"] - return token - - @staticmethod - def _get_link(url: str) -> set[str]: - gfycat_id = re.match(r".*/(.*?)(?:/?|-.*|\..{3-4})$", url).group(1) - url = "https://gfycat.com/" + gfycat_id - - response = Gfycat.retrieve_url(url) - if re.search(r"(redgifs|gifdeliverynetwork)", response.url): - url = url.lower() - return Redgifs._get_link(url) - - auth_token = Gfycat._get_auth_token() - if not auth_token: - raise 
SiteDownloaderError("Unable to retrieve Gfycat API token") - - headers = { - "referer": "https://gfycat.com/", - "origin": "https://gfycat.com", - "content-type": "application/json", - "Authorization": f"Bearer {auth_token}", - } - content = Gfycat.retrieve_url(f"https://api.gfycat.com/v1/gfycats/{gfycat_id}", headers=headers) - - if content is None: - raise SiteDownloaderError("Could not read the API source") - - try: - response_json = json.loads(content.text) - except json.JSONDecodeError as e: - raise SiteDownloaderError(f"Received data was not valid JSON: {e}") - - try: - out = response_json["gfyItem"]["mp4Url"] - except (IndexError, KeyError, AttributeError) as e: - raise SiteDownloaderError(f"Failed to download Gfycat link {url}: {e}") - return { - out, - } diff --git a/tests/site_downloaders/test_download_factory.py b/tests/site_downloaders/test_download_factory.py index 2c25d7b5..0684523d 100644 --- a/tests/site_downloaders/test_download_factory.py +++ b/tests/site_downloaders/test_download_factory.py @@ -10,7 +10,6 @@ from bdfr.site_downloaders.erome import Erome from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback from bdfr.site_downloaders.gallery import Gallery -from bdfr.site_downloaders.gfycat import Gfycat from bdfr.site_downloaders.imgur import Imgur from bdfr.site_downloaders.pornhub import PornHub from bdfr.site_downloaders.redgifs import Redgifs @@ -35,7 +34,6 @@ ("https://imgur.com/a/MkxAzeg", Imgur), ("https://m.imgur.com/a/py3RW0j", Imgur), ("https://www.reddit.com/gallery/lu93m7", Gallery), - ("https://gfycat.com/concretecheerfulfinwhale", Gfycat), ("https://www.erome.com/a/NWGw0F09", Erome), ("https://youtube.com/watch?v=Gv8Wz74FjVA", Youtube), ("https://redgifs.com/watch/courageousimpeccablecanvasback", Redgifs), diff --git a/tests/site_downloaders/test_gfycat.py b/tests/site_downloaders/test_gfycat.py deleted file mode 100644 index 0daaeb6f..00000000 --- a/tests/site_downloaders/test_gfycat.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -from unittest.mock import Mock - -import pytest - -from bdfr.resource import Resource -from bdfr.site_downloaders.gfycat import Gfycat - - -@pytest.mark.online -def test_auth_cache(): - auth1 = Gfycat._get_auth_token() - auth2 = Gfycat._get_auth_token() - assert auth1 == auth2 - - -@pytest.mark.online -@pytest.mark.parametrize( - ("test_url", "expected_url"), - ( - ("https://gfycat.com/definitivecaninecrayfish", "https://giant.gfycat.com/DefinitiveCanineCrayfish.mp4"), - ("https://gfycat.com/dazzlingsilkyiguana", "https://giant.gfycat.com/DazzlingSilkyIguana.mp4"), - ("https://gfycat.com/ComposedWholeBullfrog", "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4"), - ( - "https://thumbs.gfycat.com/ComposedWholeBullfrog-size_restricted.gif", - "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4", - ), - ( - "https://giant.gfycat.com/ComposedWholeBullfrog.mp4", - "https://thumbs44.redgifs.com/ComposedWholeBullfrog.mp4", - ), - ), -) -def test_get_link(test_url: str, expected_url: str): - result = Gfycat._get_link(test_url) - assert expected_url in result.pop() - - -@pytest.mark.online -@pytest.mark.parametrize( - ("test_url", "expected_hash"), - ( - ("https://gfycat.com/definitivecaninecrayfish", "48f9bd4dbec1556d7838885612b13b39"), - ("https://gfycat.com/dazzlingsilkyiguana", "808941b48fc1e28713d36dd7ed9dc648"), - ("https://gfycat.com/ComposedWholeBullfrog", "5292343665a13b5369d889d911ae284d"), - ("https://thumbs.gfycat.com/ComposedWholeBullfrog-size_restricted.gif", 
"5292343665a13b5369d889d911ae284d"), - ("https://giant.gfycat.com/ComposedWholeBullfrog.mp4", "5292343665a13b5369d889d911ae284d"), - ), -) -def test_download_resource(test_url: str, expected_hash: str): - mock_submission = Mock() - mock_submission.url = test_url - test_site = Gfycat(mock_submission) - resources = test_site.find_resources() - assert len(resources) == 1 - assert isinstance(resources[0], Resource) - resources[0].download() - assert resources[0].hash.hexdigest() == expected_hash From 8d71e773fd64e65973097bca1f29c7fd1b913537 Mon Sep 17 00:00:00 2001 From: Serene-Arc <33189705+Serene-Arc@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:03:00 +1000 Subject: [PATCH 80/82] Merge tests --- tests/site_downloaders/test_chevereto.py | 42 ++++++------------------ 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/tests/site_downloaders/test_chevereto.py b/tests/site_downloaders/test_chevereto.py index 43fc0ab8..42cdcf85 100644 --- a/tests/site_downloaders/test_chevereto.py +++ b/tests/site_downloaders/test_chevereto.py @@ -6,38 +6,6 @@ from bdfr.site_downloaders.chevereto import Chevereto -@pytest.mark.online -@pytest.mark.parametrize( - ("test_url", "expected"), - ( - ( - "https://nsfw.pics/album/Test.l2t", # Album - { - "https://nsfw.pics/image/OdfV", - "https://nsfw.pics/image/ObUF", - "https://nsfw.pics/image/OOV7", - "https://nsfw.pics/image/OD71", - "https://nsfw.pics/image/O6du", - }, - ), - ( - "https://lensdump.com/a/Vb411", # Album - { - "https://lensdump.com/i/CDIUci", - "https://lensdump.com/i/CDIXZo", - "https://lensdump.com/i/CDIwD2", - "https://lensdump.com/i/CDI5VC", - "https://lensdump.com/i/CDIGn5", - }, - ), - ), -) -def test_get_album(test_url: str, expected: set[str]): - results = Chevereto._get_album_links(test_url) - assert len(results) == len(expected) - assert sorted(results) == sorted(expected) - - @pytest.mark.online @pytest.mark.parametrize( ("test_url", "expected"), @@ -70,6 +38,16 @@ def test_get_album(test_url: str, expected: set[str]): "https://lensdump.com/i/CDIUci", # Single image {"https://i3.lensdump.com/i/CDIUci.gif?open=true"}, ), + ( + "https://lensdump.com/a/Vb411", # Album + { + "https://lensdump.com/i/CDIUci", + "https://lensdump.com/i/CDIXZo", + "https://lensdump.com/i/CDIwD2", + "https://lensdump.com/i/CDI5VC", + "https://lensdump.com/i/CDIGn5", + }, + ), ), ) def test_get_links(test_url: str, expected: set[str]): From 89521d968768546c6c236c0869479ed12e08a969 Mon Sep 17 00:00:00 2001 From: Serene-Arc <33189705+Serene-Arc@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:05:56 +1000 Subject: [PATCH 81/82] Remove test case for domain that no longer exists --- tests/site_downloaders/test_chevereto.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/site_downloaders/test_chevereto.py b/tests/site_downloaders/test_chevereto.py index 42cdcf85..f70c1207 100644 --- a/tests/site_downloaders/test_chevereto.py +++ b/tests/site_downloaders/test_chevereto.py @@ -10,16 +10,6 @@ @pytest.mark.parametrize( ("test_url", "expected"), ( - ( - "https://nsfw.pics/album/Test.l2t", # Album - { - "https://i.nsfw.pics/b8007b506022132fe857eead3dc98a92.gif", - "https://i.nsfw.pics/aa0541830d5d16743bca9bfb48e16b7b.gif", - "https://i.nsfw.pics/b4afb5a33e68d3d74a547f62684cddc9.jpeg", - "https://i.nsfw.pics/131ed0764342b570a338af37cdd75e3e.jpeg", - "https://i.nsfw.pics/c447389dee315f5960eb29671fb56232.jpeg", - }, - ), ( "https://lensdump.com/a/Vb411", # Album { @@ -30,10 +20,6 @@ "https://i1.lensdump.com/i/CDIGn5.jpeg?open=true", }, ), 
- ( - "https://nsfw.pics/image/OdfV", # Single image - {"https://i.nsfw.pics/b8007b506022132fe857eead3dc98a92.gif"}, - ), ( "https://lensdump.com/i/CDIUci", # Single image {"https://i3.lensdump.com/i/CDIUci.gif?open=true"}, From b5b47f93e8380c9ef576eb52cd4c80d2fadf905a Mon Sep 17 00:00:00 2001 From: Serene-Arc <33189705+Serene-Arc@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:11:18 +1000 Subject: [PATCH 82/82] Remove link to resources that no longer exist --- tests/site_downloaders/test_redgifs.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/site_downloaders/test_redgifs.py b/tests/site_downloaders/test_redgifs.py index b8a6ed40..05f8af10 100644 --- a/tests/site_downloaders/test_redgifs.py +++ b/tests/site_downloaders/test_redgifs.py @@ -41,17 +41,6 @@ def test_get_id(test_url: str, expected: str): ("https://redgifs.com/watch/springgreendecisivetaruca", {"SpringgreenDecisiveTaruca.mp4"}), ("https://www.redgifs.com/watch/palegoldenrodrawhalibut", {"PalegoldenrodRawHalibut.mp4"}), ("https://redgifs.com/watch/hollowintentsnowyowl", {"HollowIntentSnowyowl-large.jpg"}), - ( - "https://www.redgifs.com/watch/lustrousstickywaxwing", - { - "EntireEnchantingHypsilophodon-large.jpg", - "FancyMagnificentAdamsstaghornedbeetle-large.jpg", - "LustrousStickyWaxwing-large.jpg", - "ParchedWindyArmyworm-large.jpg", - "ThunderousColorlessErmine-large.jpg", - "UnripeUnkemptWoodpecker-large.jpg", - }, - ), ("https://www.redgifs.com/watch/genuineprivateguillemot/", {"GenuinePrivateGuillemot.mp4"}), ), ) @@ -71,17 +60,6 @@ def test_get_link(test_url: str, expected: set[str]): ("https://redgifs.com/watch/leafysaltydungbeetle", {"076792c660b9c024c0471ef4759af8bd"}), ("https://www.redgifs.com/watch/palegoldenrodrawhalibut", {"46d5aa77fe80c6407de1ecc92801c10e"}), ("https://redgifs.com/watch/hollowintentsnowyowl", {"5ee51fa15e0a58e98f11dea6a6cca771"}), - ( - "https://www.redgifs.com/watch/lustrousstickywaxwing", - { - "b461e55664f07bed8d2f41d8586728fa", - "30ba079a8ed7d7adf17929dc3064c10f", - "0d4f149d170d29fc2f015c1121bab18b", - "53987d99cfd77fd65b5fdade3718f9f1", - "fb2e7d972846b83bf4016447d3060d60", - "44fb28f72ec9a5cca63fa4369ab4f672", - }, - ), ("https://thumbs46.redgifs.com/BabyishCharmingAidi-medium.jpg", {"bf14b9f3d5b630cb5fd271661226f1af"}), ), )