diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/.dockstore.yml b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/.dockstore.yml new file mode 100644 index 000000000..cca6a7e03 --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/.dockstore.yml @@ -0,0 +1,15 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true + primaryDescriptorPath: /sra-manifest-to-concatenated-fastqs.ga + testParameterFiles: + - /sra-manifest-to-concatenated-fastqs-tests.yml + authors: + - name: Lucille Delisle + orcid: 0000-0002-1964-4960 + - name: Pierre Osteil + orcid: 0000-0002-5832-6703 + - name: Wolfgang Maier + orcid: 0000-0002-9464-6640 diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/.workflowhub.yml b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/.workflowhub.yml new file mode 100644 index 000000000..012dba802 --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/.workflowhub.yml @@ -0,0 +1,5 @@ +version: '0.1' +registries: +- url: https://workflowhub.eu + project: iwc + workflow: sra-manifest-to-concatenated-fastqs/main diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/CHANGELOG.md b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/CHANGELOG.md new file mode 100644 index 000000000..ea9a6984d --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/CHANGELOG.md @@ -0,0 +1,4 @@ +# Changelog + +## [0.1] 2023-10-23 +First release. 
diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/README.md b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/README.md new file mode 100644 index 000000000..394aec6c1 --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/README.md @@ -0,0 +1,35 @@ +# SRA manifest to concatenated fastqs + +This workflow takes as input an SRA manifest from the SRA Run Selector (or any tabular dataset with a header line), downloads all sequencing run data from the SRA and arranges it into per-sample fastq or pairs of fastq datasets. + +It will work out the relationship between runs and samples from the user-indicated run and sample columns in the input and will concatenate sequencing run data as needed to obtain per-sample datasets. + +## Input dataset + +- The workflow needs a single tabular input dataset, which is supposed to list SRA run identifiers in one column and sample names in another, and which needs to have a header line. +- SRA manifests obtained via the SRA Run Selector and turned into tabular format represent valid input. + +## Input values + +- Column number with SRA ID + + For manifests obtained through the SRA Run Selector this is column 1. + +- Column number with final identifier (the column with sample names) + + The number of the column that should be used to assign sequencing runs to samples. + The names in the column will also serve as the labels of datasets in the output collection. + For manifests obtained through the SRA Run Selector suitable columns might be number 6 (BioSample), 16 (Experiment) or 36 (Sample Name); column positions vary between manifests, so check the header line of your own file. + +## Processing + +- The workflow downloads sequencing run data in fastq format with fasterq-dump (one job per SRA run ID). +- Run data gets concatenated if it comes from the same sample. + +## Outputs + +- There are 2 outputs, one with paired-end datasets, one with single-read datasets. 
+ +## Limitations + +- Special characters in sample names (anything that is not an English alphabet character, digit, underscore, dash, space, dot or comma, i.e. anything not matching `[a-zA-Z0-9_\- \.,]`) will be converted to dashes (`-`). diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/sra-manifest-to-concatenated-fastqs-tests.yml b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/sra-manifest-to-concatenated-fastqs-tests.yml new file mode 100644 index 000000000..63c795764 --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/sra-manifest-to-concatenated-fastqs-tests.yml @@ -0,0 +1,46 @@ +- doc: Test for sra-manifest-to-concatenated-fastqs.ga + job: + SRA_manifest: + class: File + path: test-data/SRA.txt + Column number with SRA ID: 1 + Column number with final identifier: 22 + outputs: + paired_output: + element_tests: + GSM461177-: + element_tests: + forward: + asserts: + has_size: + value: 294000000 + delta: 30000000 + reverse: + asserts: + has_size: + value: 307000000 + delta: 30000000 + GSM461178___a: + element_tests: + forward: + asserts: + has_size: + value: 178000000 + delta: 10000000 + reverse: + asserts: + has_size: + value: 205000000 + delta: 20000000 + single_output: + element_tests: + GSM461176.-: + asserts: + has_size: + value: 139000000 + delta: 10000000 + GSM461179-ID: + asserts: + has_size: + value: 298000000 + delta: 30000000 diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/sra-manifest-to-concatenated-fastqs.ga b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/sra-manifest-to-concatenated-fastqs.ga new file mode 100644 index 000000000..3ec3feefa --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/sra-manifest-to-concatenated-fastqs.ga @@ -0,0 +1,712 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "This workflow takes as input a SRA_manifest from SRA Run Selector and will generate one fastq file or pair of fastq files for each experiment 
(concatenated multiple runs if necessary). Output will be relabelled to match the column specified by the user.", + "creator": [ + { + "class": "Person", + "identifier": "https://orcid.org/0000-0002-1964-4960", + "name": "Lucille Delisle" + }, + { + "class": "Person", + "identifier": "https://orcid.org/0000-0002-5832-6703", + "name": "Pierre Osteil" + }, + { + "class": "Person", + "identifier": "https://orcid.org/0000-0002-9464-6640", + "name": "Wolfgang Maier" + } + ], + "format-version": "0.1", + "release": "0.1", + "license": "MIT", + "name": "sra_manifest_to_concatenated_fastqs_parallel", + "steps": { + "0": { + "annotation": "Input tabular from SRA Run Selector or home made (First row needs to be a header)", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "Input tabular from SRA Run Selector or home made (First row needs to be a header)", + "name": "SRA_manifest" + } + ], + "label": "SRA_manifest", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0, + "top": 44.5 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "7125f877-571a-4f10-b9de-48b750cf7964", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "Column number in the SRA_manifest with SRA ID (usually 1)", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "Column number in the SRA_manifest with SRA ID (usually 1)", + "name": "Column number with SRA ID" + } + ], + "label": "Column number with SRA ID", + "name": "Input parameter", + "outputs": [], + "position": { + "left": 23, + "top": 145 + }, + "tool_id": null, + "tool_state": "{\"parameter_type\": \"integer\", \"optional\": false}", + "tool_version": null, + "type": "parameter_input", + "uuid": "de844377-af66-4f4e-bc67-327e9705d5b5", + "when": null, + "workflow_outputs": [] + }, + "2": { + 
"annotation": "Column number in the SRA_list with final identifier", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "Column number in the SRA_list with final identifier", + "name": "Column number with final identifier" + } + ], + "label": "Column number with final identifier", + "name": "Input parameter", + "outputs": [], + "position": { + "left": 59, + "top": 243 + }, + "tool_id": null, + "tool_state": "{\"parameter_type\": \"integer\", \"optional\": false}", + "tool_version": null, + "type": "parameter_input", + "uuid": "3f0fca4f-b920-4ae6-bda8-8ef61ebbf409", + "when": null, + "workflow_outputs": [] + }, + "3": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/map_param_value/map_param_value/0.1.1", + "errors": null, + "id": 3, + "input_connections": { + "input_param_type|input_param": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [], + "label": "Set SRA column to 1 if at 0", + "name": "Map parameter value", + "outputs": [ + { + "name": "output_param_integer", + "type": "expression.json" + } + ], + "position": { + "left": 305, + "top": 152.5 + }, + "post_job_actions": { + "HideDatasetActionoutput_param_integer": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output_param_integer" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/map_param_value/map_param_value/0.1.1", + "tool_shed_repository": { + "changeset_revision": "a01f088d0e5e", + "name": "map_param_value", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"input_param_type\": {\"type\": \"integer\", \"__current_case__\": 1, \"input_param\": {\"__class__\": \"ConnectedValue\"}, \"mappings\": [{\"__index__\": 0, \"from\": \"0\", \"to\": \"1\"}]}, \"output_param_type\": \"integer\", \"unmapped\": {\"on_unmapped\": \"input\", \"__current_case__\": 0}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.1.1", + 
"type": "tool", + "uuid": "98c1bbbc-35f3-4a9e-ad2d-1a8420b827b5", + "when": null, + "workflow_outputs": [] + }, + "4": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/compose_text_param/compose_text_param/0.1.1", + "errors": null, + "id": 4, + "input_connections": { + "components_1|param_type|component_value": { + "id": 3, + "output_name": "output_param_integer" + }, + "components_3|param_type|component_value": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [], + "label": "Compute column expression", + "name": "Compose text parameter value", + "outputs": [ + { + "name": "out1", + "type": "expression.json" + } + ], + "position": { + "left": 527, + "top": 227 + }, + "post_job_actions": { + "HideDatasetActionout1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out1" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/compose_text_param/compose_text_param/0.1.1", + "tool_shed_repository": { + "changeset_revision": "e188c9826e0f", + "name": "compose_text_param", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"components\": [{\"__index__\": 0, \"param_type\": {\"select_param_type\": \"text\", \"__current_case__\": 0, \"component_value\": \"c\"}}, {\"__index__\": 1, \"param_type\": {\"select_param_type\": \"integer\", \"__current_case__\": 1, \"component_value\": {\"__class__\": \"ConnectedValue\"}}}, {\"__index__\": 2, \"param_type\": {\"select_param_type\": \"text\", \"__current_case__\": 0, \"component_value\": \",c\"}}, {\"__index__\": 3, \"param_type\": {\"select_param_type\": \"integer\", \"__current_case__\": 1, \"component_value\": {\"__class__\": \"ConnectedValue\"}}}], \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.1.1", + "type": "tool", + "uuid": "f93a3f4b-c1f8-45f7-b44f-b42fe0aa1856", + "when": null, + "workflow_outputs": [] + }, + "5": { + "annotation": "", + "content_id": "Cut1", + "errors": null, + "id": 5, + 
"input_connections": { + "columnList": { + "id": 4, + "output_name": "out1" + }, + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "label": "Cut columns of interest", + "name": "Cut", + "outputs": [ + { + "name": "out_file1", + "type": "tabular" + } + ], + "position": { + "left": 816, + "top": 0 + }, + "post_job_actions": {}, + "tool_id": "Cut1", + "tool_state": "{\"columnList\": {\"__class__\": \"ConnectedValue\"}, \"delimiter\": \"T\", \"input\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.2", + "type": "tool", + "uuid": "dc7f7cb5-cc22-4ec8-8fe9-025f8f265e31", + "when": null, + "workflow_outputs": [] + }, + "6": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/bgruening/text_processing/tp_find_and_replace/1.1.4", + "errors": null, + "id": 6, + "input_connections": { + "infile": { + "id": 5, + "output_name": "out_file1" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Replace", + "name": "infile" + } + ], + "label": "generate table for relabelling", + "name": "Replace", + "outputs": [ + { + "name": "outfile", + "type": "input" + } + ], + "position": { + "left": 1096, + "top": 189 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bgruening/text_processing/tp_find_and_replace/1.1.4", + "tool_shed_repository": { + "changeset_revision": "d698c222f354", + "name": "text_processing", + "owner": "bgruening", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"find_and_replace\": [{\"__index__\": 0, \"find_pattern\": \"[^\\\\w\\\\- .,\\\\t](?=[^\\\\t]+$)\", \"replace_pattern\": \"-\", \"is_regex\": true, \"global\": true, \"caseinsensitive\": false, \"wholewords\": false, \"skip_first_line\": false, \"searchwhere\": {\"searchwhere_select\": \"line\", \"__current_case__\": 0}}, {\"__index__\": 1, \"find_pattern\": \"(.+)\\\\t(.+)\", \"replace_pattern\": \"$1\\\\t$1___$2\", \"is_regex\": true, 
\"global\": false, \"caseinsensitive\": false, \"wholewords\": false, \"skip_first_line\": false, \"searchwhere\": {\"searchwhere_select\": \"line\", \"__current_case__\": 0}}], \"infile\": {\"__class__\": \"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.1.4", + "type": "tool", + "uuid": "7b4781ed-57b0-4962-a647-21e54375c195", + "when": null, + "workflow_outputs": [] + }, + "7": { + "annotation": "", + "content_id": "Cut1", + "errors": null, + "id": 7, + "input_connections": { + "input": { + "id": 5, + "output_name": "out_file1" + } + }, + "inputs": [], + "label": "Cut to get only SRA", + "name": "Cut", + "outputs": [ + { + "name": "out_file1", + "type": "tabular" + } + ], + "position": { + "left": 1347.75, + "top": 4.48333740234375 + }, + "post_job_actions": { + "HideDatasetActionout_file1": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "out_file1" + } + }, + "tool_id": "Cut1", + "tool_state": "{\"__input_ext\": \"tabular\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"columnList\": \"c1\", \"delimiter\": \"T\", \"input\": {\"__class__\": \"ConnectedValue\"}, \"input|__identifier__\": \"SRR031709\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.2", + "type": "tool", + "uuid": "f0405bd8-19f0-4a3d-afe9-3cfd1a0f01a3", + "when": null, + "workflow_outputs": [] + }, + "8": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/bgruening/split_file_to_collection/split_file_to_collection/0.5.0", + "errors": null, + "id": 8, + "input_connections": { + "split_parms|input": { + "id": 7, + "output_name": "out_file1" + } + }, + "inputs": [], + "label": "split file to get one SRA per file + header", + "name": "Split file", + "outputs": [ + { + "name": "list_output_tab", + "type": "input" + } + ], + "position": { + "left": 1645, + "top": 155.5 + }, + "post_job_actions": { + "HideDatasetActionlist_output_tab": { + 
"action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "list_output_tab" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bgruening/split_file_to_collection/split_file_to_collection/0.5.0", + "tool_shed_repository": { + "changeset_revision": "6cbe2f30c2d7", + "name": "split_file_to_collection", + "owner": "bgruening", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"split_parms\": {\"select_ftype\": \"tabular\", \"__current_case__\": 0, \"input\": {\"__class__\": \"ConnectedValue\"}, \"top\": \"1\", \"split_by\": {\"select_split_by\": \"col\", \"__current_case__\": 0, \"id_col\": \"1\", \"match_regex\": \"(.*)\", \"sub_regex\": \"\\\\1\"}}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.5.0", + "type": "tool", + "uuid": "4a1ad675-4291-4001-a748-d009299f9d40", + "when": null, + "workflow_outputs": [] + }, + "9": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/fasterq_dump/3.0.5+galaxy3", + "errors": null, + "id": 9, + "input_connections": { + "input|file_list": { + "id": 8, + "output_name": "list_output_tab" + } + }, + "inputs": [], + "label": "get Fastqs from SRA IDs", + "name": "Faster Download and Extract Reads in FASTQ", + "outputs": [ + { + "name": "list_paired", + "type": "input" + }, + { + "name": "output_collection", + "type": "input" + }, + { + "name": "output_collection_other", + "type": "input" + }, + { + "name": "log", + "type": "txt" + } + ], + "position": { + "left": 1920, + "top": 435.5 + }, + "post_job_actions": { + "HideDatasetActionlist_paired": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "list_paired" + }, + "HideDatasetActionlog": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "log" + }, + "HideDatasetActionoutput_collection": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output_collection" + }, + 
"HideDatasetActionoutput_collection_other": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output_collection_other" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/fasterq_dump/3.0.5+galaxy3", + "tool_shed_repository": { + "changeset_revision": "734abc7ac21d", + "name": "sra_tools", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adv\": {\"seq_defline\": \"@$sn/$ri\", \"minlen\": null, \"split\": \"--split-3\", \"skip_technical\": true}, \"input\": {\"input_select\": \"file_list\", \"__current_case__\": 2, \"file_list\": {\"__class__\": \"ConnectedValue\"}}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "3.0.5+galaxy3", + "type": "tool", + "uuid": "fa409017-1eea-4426-83f6-22a10b9f762c", + "when": null, + "workflow_outputs": [] + }, + "10": { + "annotation": "", + "content_id": "__RELABEL_FROM_FILE__", + "errors": null, + "id": 10, + "input_connections": { + "how|labels": { + "id": 6, + "output_name": "outfile" + }, + "input": { + "id": 9, + "output_name": "list_paired" + } + }, + "inputs": [], + "label": "relabel pair collec to get SRA+sample", + "name": "Relabel identifiers", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 2159, + "top": 329.5 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_id": "__RELABEL_FROM_FILE__", + "tool_state": "{\"how\": {\"how_select\": \"tabular\", \"__current_case__\": 1, \"labels\": {\"__class__\": \"ConnectedValue\"}, \"strict\": false}, \"input\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.0", + "type": "tool", + "uuid": "ccdc51b2-444e-4b96-90a0-c7caba6a53e8", + "when": null, + "workflow_outputs": [] + }, + "11": { + "annotation": "", + "content_id": "__RELABEL_FROM_FILE__", + 
"errors": null, + "id": 11, + "input_connections": { + "how|labels": { + "id": 6, + "output_name": "outfile" + }, + "input": { + "id": 9, + "output_name": "output_collection" + } + }, + "inputs": [], + "label": "relabel single collec to get SRA+sample", + "name": "Relabel identifiers", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 2138, + "top": 718.5 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_id": "__RELABEL_FROM_FILE__", + "tool_state": "{\"how\": {\"how_select\": \"tabular\", \"__current_case__\": 1, \"labels\": {\"__class__\": \"ConnectedValue\"}, \"strict\": false}, \"input\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.0", + "type": "tool", + "uuid": "4efa4ed9-6768-4b1e-82f6-055e97af2d2d", + "when": null, + "workflow_outputs": [] + }, + "12": { + "annotation": "", + "content_id": "__APPLY_RULES__", + "errors": null, + "id": 12, + "input_connections": { + "input": { + "id": 10, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "Apply rules", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 2390, + "top": 359.5 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_id": "__APPLY_RULES__", + "tool_state": "{\"input\": {\"__class__\": \"ConnectedValue\"}, \"rules\": {\"mapping\": [{\"columns\": [3, 1], \"editing\": false, \"type\": \"list_identifiers\"}, {\"columns\": [2], \"type\": \"paired_identifier\"}], \"rules\": [{\"error\": null, \"type\": \"add_column_metadata\", \"value\": \"identifier0\", \"warn\": null}, {\"error\": null, \"type\": \"add_column_metadata\", \"value\": \"identifier1\", \"warn\": null}, {\"error\": null, \"type\": 
\"add_column_metadata\", \"value\": \"identifier2\", \"warn\": null}, {\"error\": null, \"expression\": \"(.*?)___(.*)\", \"group_count\": null, \"replacement\": \"\\\\2\", \"target_column\": 0, \"type\": \"add_column_regex\", \"warn\": null}]}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.1.0", + "type": "tool", + "uuid": "9d65a5a8-dedb-4d63-9164-47ceefc746b3", + "when": null, + "workflow_outputs": [] + }, + "13": { + "annotation": "", + "content_id": "__APPLY_RULES__", + "errors": null, + "id": 13, + "input_connections": { + "input": { + "id": 11, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "Apply rules", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 2364, + "top": 738.5 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + } + }, + "tool_id": "__APPLY_RULES__", + "tool_state": "{\"input\": {\"__class__\": \"ConnectedValue\"}, \"rules\": {\"mapping\": [{\"columns\": [2, 0], \"editing\": false, \"type\": \"list_identifiers\"}], \"rules\": [{\"error\": null, \"type\": \"add_column_metadata\", \"value\": \"identifier0\", \"warn\": null}, {\"error\": null, \"type\": \"add_column_metadata\", \"value\": \"identifier1\", \"warn\": null}, {\"error\": null, \"expression\": \"(.*?)___(.*)\", \"group_count\": null, \"replacement\": \"\\\\2\", \"target_column\": 0, \"type\": \"add_column_regex\", \"warn\": null}]}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.1.0", + "type": "tool", + "uuid": "3938066e-faa5-469e-96c0-d4ede1fb5f11", + "when": null, + "workflow_outputs": [] + }, + "14": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/artbio/concatenate_multiple_datasets/cat_multi_datasets/1.4.1", + "errors": null, + "id": 14, + "input_connections": { + "global_condition|inputs": { + "id": 12, + "output_name": "output" + 
} + }, + "inputs": [], + "label": null, + "name": "Concatenate multiple datasets", + "outputs": [ + { + "name": "paired_output", + "type": "input" + } + ], + "position": { + "left": 2617, + "top": 347.5 + }, + "post_job_actions": { + "RenameDatasetActionpaired_output": { + "action_arguments": { + "newname": "Concatenated paired-end" + }, + "action_type": "RenameDatasetAction", + "output_name": "paired_output" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/artbio/concatenate_multiple_datasets/cat_multi_datasets/1.4.1", + "tool_shed_repository": { + "changeset_revision": "55cf9d9defd1", + "name": "concatenate_multiple_datasets", + "owner": "artbio", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"fastqsanger.gz\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"dataset_names\": false, \"global_condition\": {\"input_type\": \"paired_collection\", \"__current_case__\": 1, \"inputs\": {\"__class__\": \"ConnectedValue\"}, \"paired_cat_type\": \"by_strand\"}, \"headers\": \"0\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.4.1", + "type": "tool", + "uuid": "37b6fecb-1c87-4c41-8598-95d43b418ee0", + "when": null, + "workflow_outputs": [ + { + "label": "paired_output", + "output_name": "paired_output", + "uuid": "8abbbe2a-9b3e-4472-81ec-a8fbede8a044" + } + ] + }, + "15": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/artbio/concatenate_multiple_datasets/cat_multi_datasets/1.4.1", + "errors": null, + "id": 15, + "input_connections": { + "global_condition|inputs": { + "id": 13, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "Concatenate multiple datasets", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 2595, + "top": 719.5 + }, + "post_job_actions": { + "RenameDatasetActionout_file1": { + "action_arguments": { + "newname": "Concatenated Single-read" + }, + "action_type": 
"RenameDatasetAction", + "output_name": "out_file1" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/artbio/concatenate_multiple_datasets/cat_multi_datasets/1.4.1", + "tool_shed_repository": { + "changeset_revision": "55cf9d9defd1", + "name": "concatenate_multiple_datasets", + "owner": "artbio", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"fastqsanger.gz\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"dataset_names\": false, \"global_condition\": {\"input_type\": \"singles\", \"__current_case__\": 0, \"inputs\": {\"__class__\": \"ConnectedValue\"}}, \"headers\": \"0\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.4.1", + "type": "tool", + "uuid": "49a70885-3374-4680-bbd2-ad57b8be543d", + "when": null, + "workflow_outputs": [ + { + "label": "single_output", + "output_name": "out_file1", + "uuid": "71cbe11d-24ff-4e86-954f-acb439332d0a" + } + ] + } + }, + "tags": [], + "uuid": "363898af-e598-4f0e-abd9-e6ded395ce66", + "version": 3 +} \ No newline at end of file diff --git a/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/test-data/SRA.txt b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/test-data/SRA.txt new file mode 100644 index 000000000..791834c92 --- /dev/null +++ b/workflows/data-fetching/sra-manifest-to-concatenated-fastqs/test-data/SRA.txt @@ -0,0 +1,7 @@ +Run Assay Type AvgSpotLen Bases BioProject BioSample Bytes Center Name Consent DATASTORE filetype DATASTORE provider DATASTORE region Experiment Instrument Library Name LibraryLayout LibrarySelection LibrarySource Organism Platform ReleaseDate Sample Name SRA Study create_date version GEO_Accession (exp) source_name cell_line quality_scoring_system (run) orf_encoding_protein rip_antibody tissue Developmental_Stage quality_book_char (run) strain genotype Development_stage sex RUN_SEQUENCE_LENGTH (run) gender replicate cell_type BIOMATERIAL_PROVIDER BioRep +SRR031709 RNA-Seq 45 171576405 
PRJNA168994 SAMN00006272 103314708 GEO public run.zq,sra gs,ncbi,s3 gs.US,ncbi.public,s3.us-east-1 SRX014458 Illumina Genome Analyzer II S2_DRSC_Untreated-1 SINGLE cDNA TRANSCRIPTOMIC Drosophila melanogaster ILLUMINA 2009-12-16T00:00:00Z GSM461176.- SRP001537 2014-05-26T11:19:00Z 2 GSM461176 S2_DRSC_Untreated S2-DRSC +SRR031714 RNA-Seq 74 394229450 PRJNA168994 SAMN00006273 179603204 GEO public fastq,run.zq,sra gs,ncbi,s3 gs.US,ncbi.public,s3.us-east-1 SRX014459 Illumina Genome Analyzer II S2_DRSC_Untreated-3 PAIRED cDNA TRANSCRIPTOMIC Drosophila melanogaster ILLUMINA 2010-01-15T00:00:00Z GSM461177? SRP001537 2014-05-26T11:19:00Z 2 GSM461177 S2_DRSC_Untreated S2-DRSC log odds @ +SRR031715 RNA-Seq 74 388381304 PRJNA168994 SAMN00006273 180399264 GEO public fastq,run.zq,sra gs,ncbi,s3 gs.US,ncbi.public,s3.us-east-1 SRX014459 Illumina Genome Analyzer II S2_DRSC_Untreated-3 PAIRED cDNA TRANSCRIPTOMIC Drosophila melanogaster ILLUMINA 2010-01-21T00:00:00Z GSM461177? SRP001537 2014-05-26T11:19:00Z 2 GSM461177 S2_DRSC_Untreated S2-DRSC log odds @ +SRR031716 RNA-Seq 74 438206318 PRJNA168994 SAMN00006274 240011134 GEO public fastq,run.zq,sra gs,ncbi,s3 gs.US,ncbi.public,s3.us-east-1 SRX014460 Illumina Genome Analyzer II S2_DRSC_Untreated-4 PAIRED cDNA TRANSCRIPTOMIC Drosophila melanogaster ILLUMINA 2010-01-15T00:00:00Z GSM461178___a SRP001537 2014-05-26T11:20:00Z 2 GSM461178 S2_DRSC_Untreated S2-DRSC log odds @ +SRR031719 RNA-Seq 44 189683032 PRJNA168994 SAMN00006275 120071915 GEO public fastq,run.zq,sra gs,ncbi,s3 gs.US,ncbi.public,s3.us-east-1 SRX014461 Illumina Genome Analyzer II S2_DRSC_CG8144_RNAi-1 SINGLE cDNA TRANSCRIPTOMIC Drosophila melanogaster ILLUMINA 2009-12-16T00:00:00Z GSM461179/ID SRP001537 2014-05-26T11:19:00Z 2 GSM461179 S2_DRSC_CG8144_RNAi S2-DRSC +SRR031723 RNA-Seq 40 182610720 PRJNA168994 SAMN00006275 100216178 GEO public fastq,run.zq,sra gs,ncbi,s3 gs.US,ncbi.public,s3.us-east-1 SRX014461 Illumina Genome Analyzer II S2_DRSC_CG8144_RNAi-1 SINGLE cDNA 
TRANSCRIPTOMIC Drosophila melanogaster ILLUMINA 2009-12-18T00:00:00Z GSM461179/ID SRP001537 2014-05-26T11:20:00Z 2 GSM461179 S2_DRSC_CG8144_RNAi S2-DRSC log odds @