From dbaea2319138a2b5d062fe80ea4924334628816e Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Mon, 10 Jun 2024 15:56:16 -0400 Subject: [PATCH 01/15] Set TMPDIR in the environment to a unique directory for each browser instance and delete it when the browser quits. This is a workaround for an issue with geckodriver. When the OpenWPM extension is installed via `WebDriver.install_addon()`, geckodriver makes a copy of the XPI file in TMPDIR. However, geckodriver never deletes that file. So on a stateless crawl, you end up with one copy of the XPI file for each site visited. This workaround sets TMPDIR in the environment before creating the geckodriver service, and then deletes the directory after `driver.quit()` returns in `BrowserManager.run()`. We use this indirection because we don't have access to the name of the temporary file, and it doesn't seem safe to just delete XPI files in /tmp willy-nilly. --- openwpm/browser_manager.py | 15 +++++++++++++++ openwpm/config.py | 13 +++++++++++++ openwpm/deploy_browsers/deploy_firefox.py | 14 ++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index d9c65034d..0368f679c 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -764,6 +764,21 @@ def run(self) -> None: if isinstance(command, ShutdownSignal): driver.quit() + # Delete the temporary directory used by this browser. 
+ try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) self.status_queue.put("OK") return diff --git a/openwpm/config.py b/openwpm/config.py index 1898d3901..5471aeda8 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -104,6 +104,7 @@ class BrowserParams(DataClassJsonMixin): default=Path(tempfile.gettempdir()), metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), ) + """ The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated browser profiles and residual files are stored. @@ -140,6 +141,18 @@ class BrowserParams(DataClassJsonMixin): """ + tmpdir: Path = field( + default=Path(tempfile.gettempdir()), + metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), + ) + """ + The temporary directory used by `geckodriver`. This is confiured when the + browser is deployed in deploy_browsers.deploy_firefox. We need it in order + to delete it when the browser is closed down, because `geckodriver` doesn't + appear to delete temporary files that it creates, such as a copy of the + extension XPI file. + """ + recovery_tar: Optional[Path] = None donottrack: bool = False tracking_protection: bool = False diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 5e0647065..12e5c4c93 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -138,6 +138,19 @@ def deploy_firefox( ) fo.set_preference(name, value) + # Create a temporary directory for this instance of geckodriver that + # we can delete later. 
+ env = os.environ + browser_params.tmpdir = tempfile.mkdtemp( + prefix="openwpm_", + dir=os.getenv('TMPDIR', default='/tmp') + ) + env['TMPDIR'] = browser_params.tmpdir + logger.debug( + "BROWSER %i: Using temp dir %s" % + (browser_params.browser_id, browser_params.tmpdir) + ) + # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) @@ -150,6 +163,7 @@ def deploy_firefox( service=Service( executable_path=geckodriver_path, log_output=open(webdriver_interceptor.fifo, "w"), + env=env ), ) From 6e10cedd324cee79123dbc8280c30d684b4f5bd0 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 11 Jun 2024 15:53:50 -0400 Subject: [PATCH 02/15] Reorganized temporary directory management. - Temporary directory is created in `BrowserManagerHandle.launch_browser_manager`. - Temporary directory is deleted in `BrowserManagerHandle.shutdown_browser`. - Temporary directory is added to environment for `selenium.webdriver.firefox.service.Service` in `deploy_browsers.deploy_firefox()` --- openwpm/browser_manager.py | 43 +++++++++++++++-------- openwpm/deploy_browsers/deploy_firefox.py | 19 ++++------ scripts/install-firefox.sh | 3 +- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 0368f679c..60b87e572 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -131,6 +131,18 @@ def launch_browser_manager(self) -> bool: crash_recovery = True + # Create a unique temporary directory that we can delete + # when we shut down. Note that this doesn't force anything to + # use `tmpdir`, it just makes it available. + self.browser_params.tmpdir = Path(tempfile.mkdtemp( + prefix="openwpm_", + dir=os.getenv('TMPDIR', default='/tmp') + )) + self.logger.debug( + "BROWSER %i: Using temp dir %s" % + (self.browser_params.browser_id, self.browser_params.tmpdir) + ) + self.logger.info("BROWSER %i: Launching browser..."
% self.browser_id) self.is_fresh = not crash_recovery @@ -640,6 +652,22 @@ def shutdown_browser(self, during_init: bool, force: bool = False) -> None: if self.current_profile_path is not None: shutil.rmtree(self.current_profile_path, ignore_errors=True) + # Delete the temporary directory used by geckodriver. + try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) + class BrowserManager(Process): """ @@ -764,21 +792,6 @@ def run(self) -> None: if isinstance(command, ShutdownSignal): driver.quit() - # Delete the temporary directory used by this browser. - try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - except Exception as e: - self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir, - str(e)) - ) self.status_queue.put("OK") return diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 12e5c4c93..f64096541 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -138,22 +138,15 @@ def deploy_firefox( ) fo.set_preference(name, value) - # Create a temporary directory for this instance of geckodriver that - # we can delete later. 
- env = os.environ - browser_params.tmpdir = tempfile.mkdtemp( - prefix="openwpm_", - dir=os.getenv('TMPDIR', default='/tmp') - ) - env['TMPDIR'] = browser_params.tmpdir - logger.debug( - "BROWSER %i: Using temp dir %s" % - (browser_params.browser_id, browser_params.tmpdir) - ) - # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) + # Use browser_params.tmpdir as the temporary directory. This is so that + # geckodriver makes its copy of the extension XPI file in tmpdir, so + # we can delete it later and not have it left behind. + env = os.environ + env['TMPDIR'] = str(browser_params.tmpdir) + fo.binary_location = firefox_binary_path geckodriver_path = subprocess.check_output( "which geckodriver", encoding="utf-8", shell=True diff --git a/scripts/install-firefox.sh b/scripts/install-firefox.sh index 363f8f27e..822b9cc51 100755 --- a/scripts/install-firefox.sh +++ b/scripts/install-firefox.sh @@ -9,7 +9,8 @@ set -e # Note this script is **destructive** and will # remove the existing Firefox in the OpenWPM directory -TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE +# TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE +TAG='6c033deedc28e5dadb0b99de7336cb6ebb336631' # FIREFOX_126_0_1_RELEASE case "$(uname -s)" in Darwin) From 5849b9396b2e630c47730e5be743ed11c9276d0b Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 11 Jun 2024 16:03:44 -0400 Subject: [PATCH 03/15] Undid accidental adding of local mod to install-firefox.sh. 
--- scripts/install-firefox.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/install-firefox.sh b/scripts/install-firefox.sh index 822b9cc51..363f8f27e 100755 --- a/scripts/install-firefox.sh +++ b/scripts/install-firefox.sh @@ -9,8 +9,7 @@ set -e # Note this script is **destructive** and will # remove the existing Firefox in the OpenWPM directory -# TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE -TAG='6c033deedc28e5dadb0b99de7336cb6ebb336631' # FIREFOX_126_0_1_RELEASE +TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE case "$(uname -s)" in Darwin) From ae882fec67ad63f81da4411a454a15787cd12d67 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Thu, 13 Jun 2024 10:03:17 -0400 Subject: [PATCH 04/15] Moved temporary directory cleanup to `BrowserManagerHandle.close_browser_manager`. --- openwpm/browser_manager.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 60b87e572..478ec09f6 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -344,9 +344,24 @@ def close_browser_manager(self, force: bool = False) -> None: ) return - self.logger.debug( - "BROWSER %i: Browser manager closed successfully." % self.browser_id - ) + # Delete the temporary directory used by geckodriver. + try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) + self.logger.debug( + "BROWSER %i: Browser manager closed successfully." 
% self.browser_id + ) shutdown_complete = True finally: if not shutdown_complete: @@ -652,22 +667,6 @@ def shutdown_browser(self, during_init: bool, force: bool = False) -> None: if self.current_profile_path is not None: shutil.rmtree(self.current_profile_path, ignore_errors=True) - # Delete the temporary directory used by geckodriver. - try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - except Exception as e: - self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir, - str(e)) - ) - class BrowserManager(Process): """ From 7477aa6114bd1fcef5a51d3f8bb9ef5c3570d40f Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Wed, 26 Jun 2024 14:34:40 -0400 Subject: [PATCH 05/15] Delete leftover temporary directory. `BrowserParams.tmpdir` is now `Optional[Path]`. Value `None` means no temporary directory has been set, `Some p` means it has been set to `p`. When the browser manager is launched, if `tmpdir` is not `None`, try to delete it, assuming that it is leftover from some failure that prevented normal deletion. Regardless, then make a new temporary directory. Make sure to set it to `None` when the temporary directory is deleted during normal cleanup. --- openwpm/browser_manager.py | 15 +++++++++++++++ openwpm/config.py | 14 +++++++------- openwpm/deploy_browsers/deploy_firefox.py | 6 ++++-- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 478ec09f6..b7e161725 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -134,6 +134,20 @@ def launch_browser_manager(self) -> bool: # Create a unique temporary directory that we can delete # when we shut down. Note that this doesn't force anything to # use `tmpdir`, it just makes it available. 
+ if self.browser_params.tmpdir is not None: + self.logger.debug( + "BROWSER %i: leftover temp directory %s? Deleting it." % + (self.browser_params.browser_id, self.browser_params.tmpdir) + ) + try: + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.debug( + "BROWSER %i: error deleting %s: %s." % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) self.browser_params.tmpdir = Path(tempfile.mkdtemp( prefix="openwpm_", dir=os.getenv('TMPDIR', default='/tmp') @@ -352,6 +366,7 @@ def close_browser_manager(self, force: bool = False) -> None: self.browser_params.tmpdir) ) shutil.rmtree(self.browser_params.tmpdir) + self.browser_params.tmpdir = None except Exception as e: self.logger.warn( "BROWSER %i: failed to delete temp dir %s: %s" % diff --git a/openwpm/config.py b/openwpm/config.py index 5471aeda8..4790ae091 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -141,16 +141,16 @@ class BrowserParams(DataClassJsonMixin): """ - tmpdir: Path = field( - default=Path(tempfile.gettempdir()), + tmpdir: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), ) """ - The temporary directory used by `geckodriver`. This is confiured when the - browser is deployed in deploy_browsers.deploy_firefox. We need it in order - to delete it when the browser is closed down, because `geckodriver` doesn't - appear to delete temporary files that it creates, such as a copy of the - extension XPI file. + The temporary directory used by `geckodriver`. This is configured in + `BrowserManager.run` and then deleted when the browser is finished. We do + this because it seems that `geckodriver` doesn't clean up its temporary + files (in particular, a copy of the extension XPI file), so we need to do + so ourselves. 
""" recovery_tar: Optional[Path] = None diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index f64096541..00eb8fc0d 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -143,8 +143,10 @@ def deploy_firefox( # Use browser_params.tmpdir as the temporary directory. This is so that # geckodriver makes its copy of the extension XPI file in tmpdir, so - # we can delete it later and not have it left behind. - env = os.environ + # we can delete it later and not have it left behind. I make a shallow + # copy of `os.environ` because I'm a little nervous about modifying the + # OpenWPM process' environment. + env = os.environ.copy() env['TMPDIR'] = str(browser_params.tmpdir) fo.binary_location = firefox_binary_path From 2e09752c0f3cc4387ef348836ada36710f6969ac Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 2 Jul 2024 11:38:26 -0400 Subject: [PATCH 06/15] Moved temporary directory deletion to finally block in BrowserManagerHandle.close_browser_manager. --- openwpm/browser_manager.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index b7e161725..1e6ac53d0 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -358,22 +358,6 @@ def close_browser_manager(self, force: bool = False) -> None: ) return - # Delete the temporary directory used by geckodriver. - try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - self.browser_params.tmpdir = None - except Exception as e: - self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir, - str(e)) - ) self.logger.debug( "BROWSER %i: Browser manager closed successfully." 
% self.browser_id ) @@ -382,6 +366,23 @@ def close_browser_manager(self, force: bool = False) -> None: if not shutdown_complete: self.kill_browser_manager() + # Delete the temporary directory used by geckodriver. + try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + self.browser_params.tmpdir = None + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) + def execute_command_sequence( self, # Quoting to break cyclic import, see https://stackoverflow.com/a/39757388 From bc0d6ae7b244da78e405b9ac032464fd83677ea4 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Wed, 3 Jul 2024 12:19:38 -0400 Subject: [PATCH 07/15] Reformatted with `black` as per `CONTRIBUTING.md`. --- openwpm/browser_manager.py | 36 ++++++++++++----------- openwpm/deploy_browsers/deploy_firefox.py | 4 +-- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 1e6ac53d0..781ac9f71 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -136,25 +136,26 @@ def launch_browser_manager(self) -> bool: # use `tmpdir`, it just makes it available. if self.browser_params.tmpdir is not None: self.logger.debug( - "BROWSER %i: leftover temp directory %s? Deleting it." % - (self.browser_params.browser_id, self.browser_params.tmpdir) + "BROWSER %i: leftover temp directory %s? Deleting it." + % (self.browser_params.browser_id, self.browser_params.tmpdir) ) try: shutil.rmtree(self.browser_params.tmpdir) except Exception as e: self.logger.debug( - "BROWSER %i: error deleting %s: %s." % - (self.browser_params.browser_id, + "BROWSER %i: error deleting %s: %s." 
+ % ( + self.browser_params.browser_id, self.browser_params.tmpdir, - str(e)) + str(e), + ) ) - self.browser_params.tmpdir = Path(tempfile.mkdtemp( - prefix="openwpm_", - dir=os.getenv('TMPDIR', default='/tmp') - )) + self.browser_params.tmpdir = Path( + tempfile.mkdtemp(prefix="openwpm_", dir=os.getenv("TMPDIR", default="/tmp")) + ) self.logger.debug( - "BROWSER %i: Using temp dir %s" % - (self.browser_params.browser_id, self.browser_params.tmpdir) + "BROWSER %i: Using temp dir %s" + % (self.browser_params.browser_id, self.browser_params.tmpdir) ) self.logger.info("BROWSER %i: Launching browser..." % self.browser_id) @@ -369,18 +370,19 @@ def close_browser_manager(self, force: bool = False) -> None: # Delete the temporary directory used by geckodriver. try: self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) + "BROWSER %i: deleting temp dir %s" + % (self.browser_params.browser_id, self.browser_params.tmpdir) ) shutil.rmtree(self.browser_params.tmpdir) self.browser_params.tmpdir = None except Exception as e: self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, + "BROWSER %i: failed to delete temp dir %s: %s" + % ( + self.browser_params.browser_id, self.browser_params.tmpdir, - str(e)) + str(e), + ) ) def execute_command_sequence( diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 00eb8fc0d..d8a1e4cd5 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -147,7 +147,7 @@ def deploy_firefox( # copy of `os.environ` because I'm a little nervous about modifying the # OpenWPM process' environment. 
env = os.environ.copy() - env['TMPDIR'] = str(browser_params.tmpdir) + env["TMPDIR"] = str(browser_params.tmpdir) fo.binary_location = firefox_binary_path geckodriver_path = subprocess.check_output( @@ -158,7 +158,7 @@ def deploy_firefox( service=Service( executable_path=geckodriver_path, log_output=open(webdriver_interceptor.fifo, "w"), - env=env + env=env, ), ) From f98285c6bf64d2ff8a52a4694a9a67c4f733e548 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Mon, 10 Jun 2024 15:56:16 -0400 Subject: [PATCH 08/15] Set TMPDIR in the environment to a unique directory for each browser instance and delete it when the browser quits. This is a workaround for an issue with geckodriver. When the OpenWPM extension is installed via `WebDriver.install_addon()`, geckodriver makes a copy of the XPI file in TMPDIR. However, geckodriver never deletes that file. So on a stateless crawl, you end up with one copy of the XPI file for each site visited. This workaround sets TMPDIR in the environment before creating the geckodriver service, and then deletes the directory after `driver.quit()` returns in `BrowserManager.run()`. We use this indirection because we don't have access to the name of the temporary file, and it doesn't seem safe to just delete XPI files in /tmp willy-nilly. --- openwpm/browser_manager.py | 15 +++++++++++++++ openwpm/config.py | 13 +++++++++++++ openwpm/deploy_browsers/deploy_firefox.py | 14 ++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index d9c65034d..0368f679c 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -764,6 +764,21 @@ def run(self) -> None: if isinstance(command, ShutdownSignal): driver.quit() + # Delete the temporary directory used by this browser. 
+ try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) self.status_queue.put("OK") return diff --git a/openwpm/config.py b/openwpm/config.py index 1898d3901..5471aeda8 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -104,6 +104,7 @@ class BrowserParams(DataClassJsonMixin): default=Path(tempfile.gettempdir()), metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), ) + """ The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated browser profiles and residual files are stored. @@ -140,6 +141,18 @@ class BrowserParams(DataClassJsonMixin): """ + tmpdir: Path = field( + default=Path(tempfile.gettempdir()), + metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), + ) + """ + The temporary directory used by `geckodriver`. This is confiured when the + browser is deployed in deploy_browsers.deploy_firefox. We need it in order + to delete it when the browser is closed down, because `geckodriver` doesn't + appear to delete temporary files that it creates, such as a copy of the + extension XPI file. + """ + recovery_tar: Optional[Path] = None donottrack: bool = False tracking_protection: bool = False diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 5e0647065..12e5c4c93 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -138,6 +138,19 @@ def deploy_firefox( ) fo.set_preference(name, value) + # Create a temporary directory for this instance of geckodriver that + # we can delete later. 
+ env = os.environ + browser_params.tmpdir = tempfile.mkdtemp( + prefix="openwpm_", + dir=os.getenv('TMPDIR', default='/tmp') + ) + env['TMPDIR'] = browser_params.tmpdir + logger.debug( + "BROWSER %i: Using temp dir %s" % + (browser_params.browser_id, browser_params.tmpdir) + ) + # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) @@ -150,6 +163,7 @@ def deploy_firefox( service=Service( executable_path=geckodriver_path, log_output=open(webdriver_interceptor.fifo, "w"), + env=env ), ) From 9f033e2a39f87174543b2359798c248809a80f9b Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 11 Jun 2024 15:53:50 -0400 Subject: [PATCH 09/15] Reorganized temporary directory management. - Temporary directory is created in `BrowserManagerHandle.launch_browser_manager`. - Temporary directory is deleted in `BrowserManagerHandle.shutdown_browser`. - Temporary directory is added to environment for `selenium.webdriver.firefox.service.Service` in `deploy_browsers.deploy_firefox()` --- openwpm/browser_manager.py | 43 +++++++++++++++-------- openwpm/deploy_browsers/deploy_firefox.py | 19 ++++------ scripts/install-firefox.sh | 3 +- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 0368f679c..60b87e572 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -131,6 +131,18 @@ def launch_browser_manager(self) -> bool: crash_recovery = True + # Create a unique temporary directory that we can delete + # when we shut down. Note that this doesn't force anything to + # use `tmpdir`, it just makes it available. + self.browser_params.tmpdir = Path(tempfile.mkdtemp( + prefix="openwpm_", + dir=os.getenv('TMPDIR', default='/tmp') + )) + self.logger.debug( + "BROWSER %i: Using temp dir %s" % + (self.browser_params.browser_id, self.browser_params.tmpdir) + ) + self.logger.info("BROWSER %i: Launching browser..."
% self.browser_id) self.is_fresh = not crash_recovery @@ -640,6 +652,22 @@ def shutdown_browser(self, during_init: bool, force: bool = False) -> None: if self.current_profile_path is not None: shutil.rmtree(self.current_profile_path, ignore_errors=True) + # Delete the temporary directory used by geckodriver. + try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) + class BrowserManager(Process): """ @@ -764,21 +792,6 @@ def run(self) -> None: if isinstance(command, ShutdownSignal): driver.quit() - # Delete the temporary directory used by this browser. - try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - except Exception as e: - self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir, - str(e)) - ) self.status_queue.put("OK") return diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 12e5c4c93..f64096541 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -138,22 +138,15 @@ def deploy_firefox( ) fo.set_preference(name, value) - # Create a temporary directory for this instance of geckodriver that - # we can delete later. 
- env = os.environ - browser_params.tmpdir = tempfile.mkdtemp( - prefix="openwpm_", - dir=os.getenv('TMPDIR', default='/tmp') - ) - env['TMPDIR'] = browser_params.tmpdir - logger.debug( - "BROWSER %i: Using temp dir %s" % - (browser_params.browser_id, browser_params.tmpdir) - ) - # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) + # Use browser_params.tmpdir as the temporary directory. This is so that + # geckodriver makes its copy of the extension XPI file in tmpdir, so + # we can delete it later and not have it left behind. + env = os.environ + env['TMPDIR'] = str(browser_params.tmpdir) + fo.binary_location = firefox_binary_path geckodriver_path = subprocess.check_output( "which geckodriver", encoding="utf-8", shell=True diff --git a/scripts/install-firefox.sh b/scripts/install-firefox.sh index 363f8f27e..822b9cc51 100755 --- a/scripts/install-firefox.sh +++ b/scripts/install-firefox.sh @@ -9,7 +9,8 @@ set -e # Note this script is **destructive** and will # remove the existing Firefox in the OpenWPM directory -TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE +# TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE +TAG='6c033deedc28e5dadb0b99de7336cb6ebb336631' # FIREFOX_126_0_1_RELEASE case "$(uname -s)" in Darwin) From 492f3e05e6fa2aae8800be1dafd1b547b2f99474 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 11 Jun 2024 16:03:44 -0400 Subject: [PATCH 10/15] Undid accidental adding of local mod to install-firefox.sh. 
--- scripts/install-firefox.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/install-firefox.sh b/scripts/install-firefox.sh index 822b9cc51..363f8f27e 100755 --- a/scripts/install-firefox.sh +++ b/scripts/install-firefox.sh @@ -9,8 +9,7 @@ set -e # Note this script is **destructive** and will # remove the existing Firefox in the OpenWPM directory -# TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE -TAG='6c033deedc28e5dadb0b99de7336cb6ebb336631' # FIREFOX_126_0_1_RELEASE +TAG='d3c71a6fc9a1aecf1fe04f8de2fc0b816588e677' # FIREFOX_123_0_RELEASE case "$(uname -s)" in Darwin) From c484cad7f4779bf8993530ce275e34e53713d431 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Thu, 13 Jun 2024 10:03:17 -0400 Subject: [PATCH 11/15] Moved temporary directory cleanup to `BrowserManagerHandle.close_browser_manager`. --- openwpm/browser_manager.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 60b87e572..478ec09f6 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -344,9 +344,24 @@ def close_browser_manager(self, force: bool = False) -> None: ) return - self.logger.debug( - "BROWSER %i: Browser manager closed successfully." % self.browser_id - ) + # Delete the temporary directory used by geckodriver. + try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) + self.logger.debug( + "BROWSER %i: Browser manager closed successfully." 
% self.browser_id + ) shutdown_complete = True finally: if not shutdown_complete: @@ -652,22 +667,6 @@ def shutdown_browser(self, during_init: bool, force: bool = False) -> None: if self.current_profile_path is not None: shutil.rmtree(self.current_profile_path, ignore_errors=True) - # Delete the temporary directory used by geckodriver. - try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - except Exception as e: - self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir, - str(e)) - ) - class BrowserManager(Process): """ From 3defbf86be40d3c6d8e407554cb15e80f151f18e Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Wed, 26 Jun 2024 14:34:40 -0400 Subject: [PATCH 12/15] Delete leftover temporary directory. `BrowserParams.tmpdir` is now `Optional[Path]`. Value `None` means no temporary directory has been set, `Some p` means it has been set to `p`. When the browser manager is launched, if `tmpdir` is not `None`, try to delete it, assuming that it is leftover from some failure that prevented normal deletion. Regardless, then make a new temporary directory. Make sure to set it to `None` when the temporary directory is deleted during normal cleanup. --- openwpm/browser_manager.py | 15 +++++++++++++++ openwpm/config.py | 14 +++++++------- openwpm/deploy_browsers/deploy_firefox.py | 6 ++++-- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 478ec09f6..b7e161725 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -134,6 +134,20 @@ def launch_browser_manager(self) -> bool: # Create a unique temporary directory that we can delete # when we shut down. Note that this doesn't force anything to # use `tmpdir`, it just makes it available. 
+ if self.browser_params.tmpdir is not None: + self.logger.debug( + "BROWSER %i: leftover temp directory %s? Deleting it." % + (self.browser_params.browser_id, self.browser_params.tmpdir) + ) + try: + shutil.rmtree(self.browser_params.tmpdir) + except Exception as e: + self.logger.debug( + "BROWSER %i: error deleting %s: %s." % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) self.browser_params.tmpdir = Path(tempfile.mkdtemp( prefix="openwpm_", dir=os.getenv('TMPDIR', default='/tmp') @@ -352,6 +366,7 @@ def close_browser_manager(self, force: bool = False) -> None: self.browser_params.tmpdir) ) shutil.rmtree(self.browser_params.tmpdir) + self.browser_params.tmpdir = None except Exception as e: self.logger.warn( "BROWSER %i: failed to delete temp dir %s: %s" % diff --git a/openwpm/config.py b/openwpm/config.py index 5471aeda8..4790ae091 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -141,16 +141,16 @@ class BrowserParams(DataClassJsonMixin): """ - tmpdir: Path = field( - default=Path(tempfile.gettempdir()), + tmpdir: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), ) """ - The temporary directory used by `geckodriver`. This is confiured when the - browser is deployed in deploy_browsers.deploy_firefox. We need it in order - to delete it when the browser is closed down, because `geckodriver` doesn't - appear to delete temporary files that it creates, such as a copy of the - extension XPI file. + The temporary directory used by `geckodriver`. This is created in + `BrowserManagerHandle.launch_browser_manager` and deleted in + `close_browser_manager` when the browser is finished. We do this because + it seems that `geckodriver` doesn't clean up its temporary files (in + particular, a copy of the extension XPI file), so we need to do so ourselves.
""" recovery_tar: Optional[Path] = None diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index f64096541..00eb8fc0d 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -143,8 +143,10 @@ def deploy_firefox( # Use browser_params.tmpdir as the temporary directory. This is so that # geckodriver makes its copy of the extension XPI file in tmpdir, so - # we can delete it later and not have it left behind. - env = os.environ + # we can delete it later and not have it left behind. I make a shallow + # copy of `os.environ` because I'm a little nervous about modifying the + # OpenWPM process' environment. + env = os.environ.copy() env['TMPDIR'] = str(browser_params.tmpdir) fo.binary_location = firefox_binary_path From 2bcfca7f4820ee89b9293eb0754ded5fb7539b0b Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 2 Jul 2024 11:38:26 -0400 Subject: [PATCH 13/15] Moved temporary directory deletion to finally block in BrowserManagerHandle.close_browser_manager. --- openwpm/browser_manager.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index b7e161725..1e6ac53d0 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -358,22 +358,6 @@ def close_browser_manager(self, force: bool = False) -> None: ) return - # Delete the temporary directory used by geckodriver. - try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - self.browser_params.tmpdir = None - except Exception as e: - self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir, - str(e)) - ) self.logger.debug( "BROWSER %i: Browser manager closed successfully." 
% self.browser_id ) @@ -382,6 +366,23 @@ def close_browser_manager(self, force: bool = False) -> None: if not shutdown_complete: self.kill_browser_manager() + # Delete the temporary directory used by geckodriver. + try: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + self.browser_params.tmpdir = None + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s: %s" % + (self.browser_params.browser_id, + self.browser_params.tmpdir, + str(e)) + ) + def execute_command_sequence( self, # Quoting to break cyclic import, see https://stackoverflow.com/a/39757388 From de1da0d5e7cfc2314fc4a711b2edc364f7eff72c Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Wed, 3 Jul 2024 12:19:38 -0400 Subject: [PATCH 14/15] Reformatted with `black` as per `CONTRIBUTING.md`. --- openwpm/browser_manager.py | 36 ++++++++++++----------- openwpm/deploy_browsers/deploy_firefox.py | 4 +-- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 1e6ac53d0..781ac9f71 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -136,25 +136,26 @@ def launch_browser_manager(self) -> bool: # use `tmpdir`, it just makes it available. if self.browser_params.tmpdir is not None: self.logger.debug( - "BROWSER %i: leftover temp directory %s? Deleting it." % - (self.browser_params.browser_id, self.browser_params.tmpdir) + "BROWSER %i: leftover temp directory %s? Deleting it." + % (self.browser_params.browser_id, self.browser_params.tmpdir) ) try: shutil.rmtree(self.browser_params.tmpdir) except Exception as e: self.logger.debug( - "BROWSER %i: error deleting %s: %s." % - (self.browser_params.browser_id, + "BROWSER %i: error deleting %s: %s." 
+ % ( + self.browser_params.browser_id, self.browser_params.tmpdir, - str(e)) + str(e), + ) ) - self.browser_params.tmpdir = Path(tempfile.mkdtemp( - prefix="openwpm_", - dir=os.getenv('TMPDIR', default='/tmp') - )) + self.browser_params.tmpdir = Path( + tempfile.mkdtemp(prefix="openwpm_", dir=os.getenv("TMPDIR", default="/tmp")) + ) self.logger.debug( - "BROWSER %i: Using temp dir %s" % - (self.browser_params.browser_id, self.browser_params.tmpdir) + "BROWSER %i: Using temp dir %s" + % (self.browser_params.browser_id, self.browser_params.tmpdir) ) self.logger.info("BROWSER %i: Launching browser..." % self.browser_id) @@ -369,18 +370,19 @@ def close_browser_manager(self, force: bool = False) -> None: # Delete the temporary directory used by geckodriver. try: self.logger.debug( - "BROWSER %i: deleting temp dir %s" % - (self.browser_params.browser_id, - self.browser_params.tmpdir) + "BROWSER %i: deleting temp dir %s" + % (self.browser_params.browser_id, self.browser_params.tmpdir) ) shutil.rmtree(self.browser_params.tmpdir) self.browser_params.tmpdir = None except Exception as e: self.logger.warn( - "BROWSER %i: failed to delete temp dir %s: %s" % - (self.browser_params.browser_id, + "BROWSER %i: failed to delete temp dir %s: %s" + % ( + self.browser_params.browser_id, self.browser_params.tmpdir, - str(e)) + str(e), + ) ) def execute_command_sequence( diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 00eb8fc0d..d8a1e4cd5 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -147,7 +147,7 @@ def deploy_firefox( # copy of `os.environ` because I'm a little nervous about modifying the # OpenWPM process' environment. 
env = os.environ.copy() - env['TMPDIR'] = str(browser_params.tmpdir) + env["TMPDIR"] = str(browser_params.tmpdir) fo.binary_location = firefox_binary_path geckodriver_path = subprocess.check_output( @@ -158,7 +158,7 @@ def deploy_firefox( service=Service( executable_path=geckodriver_path, log_output=open(webdriver_interceptor.fifo, "w"), - env=env + env=env, ), ) From d3c2e2df4cb16e5774994b10aa5c40e409376fe9 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Tue, 23 Jul 2024 09:27:00 -0400 Subject: [PATCH 15/15] Fixed mypy errors for everything but the instantiation of `webdriver.Firefox` in `deploy_browsers/deploy_firefox.py`. --- openwpm/browser_manager.py | 21 +++++++++++---------- openwpm/deploy_browsers/deploy_firefox.py | 6 +++--- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 781ac9f71..83ee7bc96 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -137,7 +137,7 @@ def launch_browser_manager(self) -> bool: if self.browser_params.tmpdir is not None: self.logger.debug( "BROWSER %i: leftover temp directory %s? Deleting it." - % (self.browser_params.browser_id, self.browser_params.tmpdir) + % (self.browser_id, self.browser_params.tmpdir) ) try: shutil.rmtree(self.browser_params.tmpdir) @@ -145,7 +145,7 @@ def launch_browser_manager(self) -> bool: self.logger.debug( "BROWSER %i: error deleting %s: %s." % ( - self.browser_params.browser_id, + self.browser_id, self.browser_params.tmpdir, str(e), ) @@ -155,7 +155,7 @@ def launch_browser_manager(self) -> bool: ) self.logger.debug( "BROWSER %i: Using temp dir %s" - % (self.browser_params.browser_id, self.browser_params.tmpdir) + % (self.browser_id, self.browser_params.tmpdir) ) self.logger.info("BROWSER %i: Launching browser..." % self.browser_id) @@ -369,17 +369,18 @@ def close_browser_manager(self, force: bool = False) -> None: # Delete the temporary directory used by geckodriver. 
try: - self.logger.debug( - "BROWSER %i: deleting temp dir %s" - % (self.browser_params.browser_id, self.browser_params.tmpdir) - ) - shutil.rmtree(self.browser_params.tmpdir) - self.browser_params.tmpdir = None + if self.browser_params.tmpdir is not None: + self.logger.debug( + "BROWSER %i: deleting temp dir %s" + % (self.browser_id, self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + self.browser_params.tmpdir = None except Exception as e: self.logger.warn( "BROWSER %i: failed to delete temp dir %s: %s" % ( - self.browser_params.browser_id, + self.browser_id, self.browser_params.tmpdir, str(e), ) diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index d8a1e4cd5..f51a54678 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -143,9 +143,9 @@ def deploy_firefox( # Use browser_params.tmpdir as the temporary directory. This is so that # geckodriver makes its copy of the extension XPI file in tmpdir, so - # we can delete it later and not have it left behind. I make a shallow - # copy of `os.environ` because I'm a little nervous about modifying the - # OpenWPM process' environment. + # we can delete it later and not have it left behind. I make a copy of + # `os.environ` because I'm a little nervous about modifying the OpenWPM + # process' environment. env = os.environ.copy() env["TMPDIR"] = str(browser_params.tmpdir)