src.co_tools.co_fastq

  1import os
  2import re
  3from glob import glob
  4from pathlib import Path
  5
  6if os.getenv("CO_LOG").lower() == "true":
  7    from .get_logger import LOGGER
  8
  9    log = LOGGER
 10else:
 11    import logging
 12
 13    log = logging.getLogger(__name__)
 14
 15
 16def get_fastq_pair(dir_path: str = "../data"):
 17    """This function returns a pair of paired-end reads files
 18
 19    Args:
 20        dir_path (str, optional): The folder where all the reads files are.
 21        Defaults to "../data".
 22
 23    Returns:
 24        str: comma-separated pair of reads files as path to files
 25    """
 26    total_dirs = 0
 27    for base, dirs, files in os.walk(dir_path):
 28        for dir in dirs:
 29            total_dirs += 1
 30    if total_dirs != 2:
 31        log.error(
 32            f"The fastq files in {dir_path} are not properly configured"
 33            + " to use this function. There should be only 2 folders"
 34            + " inside the data folder."
 35        )
 36        return 0
 37    prefix_dict, this_prefix = {}, None
 38    fwd, rev = None, None
 39    for path in glob(str(f"{dir_path}/**/*.fastq.gz"), recursive=True):
 40        if prefix := get_prefix(path):
 41            if prefix in prefix_dict:
 42                prefix_dict[prefix].append(path)
 43                if len(prefix_dict[prefix]) == 3:
 44                    log.info(f"prefix {prefix} occurs 3 times in the {dir_path} folder")
 45                    this_prefix = prefix
 46                    break
 47            else:
 48                prefix_dict[prefix] = [path]
 49        else:
 50            log.warning(f"No prefix determined for {path}")
 51    if not prefix_dict:
 52        log.warning(f"No files found in {dir_path}")
 53        return 0
 54    if not this_prefix:
 55        log.warning(f"fastq files in {dir_path} not properly organized")
 56        return 0
 57    for path in prefix_dict[this_prefix]:
 58        if get_read_direction(path) == "1":
 59            fwd = path
 60        elif get_read_direction(path) == "2":
 61            rev = path
 62    if fwd and rev:
 63        log.info(f"returning {fwd},{rev}")
 64        return f"{fwd},{rev}"
 65    else:
 66        log.warning(f"Could not find complementary pair of fastq files in {dir_path}")
 67        return 0
 68
 69
 70def get_fwd_fastqs(dir: str = "../data"):
 71    """Returns all the forward reads files in ascending alphabetical order
 72
 73    Args:
 74        dir (str, optional): The folder where all the reads file are.
 75        Defaults to "../data".
 76
 77    Returns:
 78        str: newline-separated string of forward reads files
 79    """
 80    if fastq_files := glob(str(f"{dir}/**/*.fastq.gz"), recursive=True):
 81        log.debug(
 82            f"Found the following fastq files in the {dir} folder:\n{fastq_files}"
 83        )
 84        pattern = get_read_pattern(fastq_files[0])
 85        fwd_fastqs_list = glob(str(f"{dir}/**/*{pattern}"), recursive=True)
 86        fwd_fastqs_list.sort()
 87        fwd_fastqs = "\n".join(fwd_fastqs_list)
 88        log.debug(f"Returning the following fwd fastq files\n{fwd_fastqs}")
 89        return fwd_fastqs
 90    else:
 91        log.error(f"There are no fastq.gz files in the {dir} directory")
 92        return 0
 93
 94
 95def get_read_direction(filepath: str):
 96    """This function returns the direction of a single paired-end reads file
 97
 98    Args:
 99        filepath (str): The path to the reads file you need the direction of
100
101    Returns:
102        str: Returns 1 if file is detected as forward, 2 otherwise
103    """
104    filename = Path(filepath).name
105    log.debug(f"filename: {filename}")
106    if "_" not in filename:
107        log.warning(
108            "You might be trying to use a single end reads file as a paired"
109            + f" end reads file. Current input: {filepath}"
110        )
111        return 0
112    return "1" if "1" in filename.split("_")[-1].split(".")[0] else "2"
113
114
def get_read_pattern(filename: str, direction: str = "1"):
    """This function returns the pattern shared for half the paired-end reads files

    For Illumina-style names (``..._R1_001.fastq.gz``) the pattern keeps the
    ``R<n>`` token, e.g. ``R1_001.fastq.gz``. For any other name the pattern
    is the text after the last underscore, with the read number switched to
    ``direction`` when needed.

    Args:
        filename (str): Name of file to determine pattern from
        direction (str, optional): The direction you need the pattern for.
        Defaults to "1". Accepts "1" for forward or "2" for reverse

    Returns:
        str: The pattern for all the forward or reverse paired-end reads file
        corresponding to the direction you specified in 'direction'. When the
        input contains a "/" but no underscore, the bare file name is
        returned unchanged.
    """
    if "_" not in filename and "/" in filename:
        log.warning(
            f"{filename} might be a single end reads file. The pattern being returned"
            + " is the entire filename"
        )
        return Path(filename).name
    # Illumina names end in "_R<n>_001.<ext>". The last token alone ("001...")
    # is shared by both directions, so build the pattern from the R<n> token.
    if illumina := re.search(r"_R[12](_\d{3}\.[^_/]*)$", filename):
        pattern = f"R{direction}{illumina.group(1)}"
        log.debug(f"pattern: {pattern}")
        return pattern
    direction_complement = "2" if direction == "1" else "1"
    pattern = filename.split("_")[-1]
    log.debug(f"pattern: {pattern}")
    return (
        pattern
        if direction in pattern
        else pattern.replace(direction_complement, direction)
    )
141
142
def get_prefix(filename: str, split_position: str = "-1"):
    """This function returns the prefix that is unique to (1) pair of paired-end files

    Args:
        filename (str): The name of the file to determine prefix from. Any
        directory part is ignored.
        split_position (str, optional): If underscores are in the filename and user
        just needs to trim the filename after a certain underscore, then
        this arg specifies where to trim, e.g.
        get_prefix("GSM1234_sample12_exp.fastq.gz", -1) returns "GSM1234_sample12".
        It is converted with int(), and a value of 0 disables this fallback.
        Defaults to "-1".

    Returns:
        str: Returns the prefix that is unique to a single pair of paired-end
        reads files. The integer 0 is returned when no prefix can be
        determined.
    """
    filename = Path(filename).name
    # illumina read files use a specific format, and sometimes allow underscores in the prefix
    # SampleName_S1_L001_R1_001.fastq.gz for lane 1
    # SampleName_S1_R1_001.fastq.gz for merged lanes.
    # The dots are escaped so that only a literal ".fastq.gz" suffix matches.
    if match := re.search(r"(.*?)_S\d+_.*R\d_001\.fastq\.gz", filename):
        log.debug(f"match: {match}\ngroup 1 (prefix): {match.group(1)}")
        return match.group(1)

    cut = int(split_position)
    if "_" in filename and cut:
        prefix_list = filename.split("_")[:cut]
        log.debug(f"prefix_list: {prefix_list}")
        return "_".join(prefix_list)

    log.warning(f"A prefix was not able to be determined for {filename}")
    return 0
174
175
def get_rev_file(
    fwd_file: str, name_only=False, pattern_fwd: bool = False, pattern_rev: bool = False
):
    """Build the reverse reads file path that matches a forward reads file

    The reverse path is produced by replacing the forward pattern with the
    reverse pattern in ``fwd_file``. A pattern left at its falsy default is
    autodetected with ``get_read_pattern``.

    Args:
        fwd_file (str): The forward file you want to find the reverse
        file for.
        name_only (bool, optional): Set to True if you want this function to
        return only the filename. Any truthy value is converted to text and
        counts as True only if that text contains "true" (case-insensitive).
        Defaults to False.
        pattern_fwd (bool, optional): The text to replace in ``fwd_file``.
        Defaults to False, which means autodetect.
        pattern_rev (bool, optional): The replacement text. Defaults to
        False, which means autodetect.

    Returns:
        str: The reverse reads file
    """
    if name_only:
        name_only = "true" in str(name_only).lower()
    if not pattern_fwd:
        pattern_fwd = get_read_pattern(fwd_file, "1")
        log.debug(f"Autodetected forward pattern: {pattern_fwd}")
    if not pattern_rev:
        pattern_rev = get_read_pattern(fwd_file, "2")
        log.debug(f"Autodetected reverse pattern: {pattern_rev}")
    log.debug(f"fwd_file: {fwd_file}\nWill replace {pattern_fwd} with {pattern_rev}")

    rev_file = fwd_file.replace(pattern_fwd, pattern_rev)
    if name_only:
        return Path(rev_file).name
    return rev_file
def get_fastq_pair(dir_path: str = '../data'):
def get_fastq_pair(dir_path: str = "../data"):
    """Return one forward/reverse pair of paired-end reads files.

    The tree under ``dir_path`` must contain exactly two directories in total
    (counted recursively). Every ``*.fastq.gz`` file is grouped by the prefix
    reported by ``get_prefix``; the search stops at the first prefix that has
    been seen three times, and the forward and reverse files are then picked
    from that group with ``get_read_direction``.

    Args:
        dir_path (str, optional): The folder where all the reads files are.
        Defaults to "../data".

    Returns:
        str: comma-separated pair of reads files ("<forward>,<reverse>") as
        paths to the files. The integer 0 is returned instead when the folder
        layout is wrong, no file yields a prefix, no prefix occurs three
        times, or no forward/reverse pair is found.
    """
    # Only the number of directories matters, so add up the per-level counts.
    total_dirs = sum(len(subdirs) for _, subdirs, _ in os.walk(dir_path))
    if total_dirs != 2:
        log.error(
            f"The fastq files in {dir_path} are not properly configured"
            + " to use this function. There should be only 2 folders"
            + " inside the data folder."
        )
        return 0

    prefix_dict, this_prefix = {}, None
    for path in glob(f"{dir_path}/**/*.fastq.gz", recursive=True):
        prefix = get_prefix(path)
        if not prefix:
            log.warning(f"No prefix determined for {path}")
            continue
        prefix_dict.setdefault(prefix, []).append(path)
        # NOTE(review): the selection rule is "first prefix seen 3 times";
        # confirm this threshold against the expected data layout.
        if len(prefix_dict[prefix]) == 3:
            log.info(f"prefix {prefix} occurs 3 times in the {dir_path} folder")
            this_prefix = prefix
            break

    if not prefix_dict:
        log.warning(f"No files found in {dir_path}")
        return 0
    if not this_prefix:
        log.warning(f"fastq files in {dir_path} not properly organized")
        return 0

    fwd, rev = None, None
    for path in prefix_dict[this_prefix]:
        # Resolve the direction once per file; a later file of the same
        # direction overrides an earlier one.
        direction = get_read_direction(path)
        if direction == "1":
            fwd = path
        elif direction == "2":
            rev = path

    if fwd and rev:
        log.info(f"returning {fwd},{rev}")
        return f"{fwd},{rev}"
    log.warning(f"Could not find complementary pair of fastq files in {dir_path}")
    return 0

This function returns a pair of paired-end reads files

Args: dir_path (str, optional): The folder where all the reads files are. Defaults to "../data".

Returns: str: comma-separated pair of reads files as path to files

def get_fwd_fastqs(dir: str = '../data'):
def get_fwd_fastqs(dir: str = "../data"):
    """Returns all the forward reads files in ascending alphabetical order

    The forward pattern is derived with ``get_read_pattern`` from the first
    ``*.fastq.gz`` file found under ``dir``; every file under ``dir`` whose
    name ends with that pattern is returned.

    Args:
        dir (str, optional): The folder where all the reads file are.
        Defaults to "../data".

    Returns:
        str: newline-separated string of forward reads files, or the
        integer 0 when no fastq.gz file exists under ``dir``
    """
    all_fastqs = glob(f"{dir}/**/*.fastq.gz", recursive=True)
    if not all_fastqs:
        log.error(f"There are no fastq.gz files in the {dir} directory")
        return 0

    log.debug(f"Found the following fastq files in the {dir} folder:\n{all_fastqs}")
    fwd_pattern = get_read_pattern(all_fastqs[0])
    matches = sorted(glob(f"{dir}/**/*{fwd_pattern}", recursive=True))
    joined = "\n".join(matches)
    log.debug(f"Returning the following fwd fastq files\n{joined}")
    return joined

Returns all the forward reads files in ascending alphabetical order

Args: dir (str, optional): The folder where all the reads file are. Defaults to "../data".

Returns: str: newline-separated string of forward reads files

def get_read_direction(filepath: str):
 96def get_read_direction(filepath: str):
 97    """This function returns the direction of a single paired-end reads file
 98
 99    Args:
100        filepath (str): The path to the reads file you need the direction of
101
102    Returns:
103        str: Returns 1 if file is detected as forward, 2 otherwise
104    """
105    filename = Path(filepath).name
106    log.debug(f"filename: {filename}")
107    if "_" not in filename:
108        log.warning(
109            "You might be trying to use a single end reads file as a paired"
110            + f" end reads file. Current input: {filepath}"
111        )
112        return 0
113    return "1" if "1" in filename.split("_")[-1].split(".")[0] else "2"

This function returns the direction of a single paired-end reads file

Args: filepath (str): The path to the reads file you need the direction of

Returns: str: Returns 1 if file is detected as forward, 2 otherwise

def get_read_pattern(filename: str, direction: str = '1'):
def get_read_pattern(filename: str, direction: str = "1"):
    """This function returns the pattern shared for half the paired-end reads files

    For Illumina-style names (``..._R1_001.fastq.gz``) the pattern keeps the
    ``R<n>`` token, e.g. ``R1_001.fastq.gz``. For any other name the pattern
    is the text after the last underscore, with the read number switched to
    ``direction`` when needed.

    Args:
        filename (str): Name of file to determine pattern from
        direction (str, optional): The direction you need the pattern for.
        Defaults to "1". Accepts "1" for forward or "2" for reverse

    Returns:
        str: The pattern for all the forward or reverse paired-end reads file
        corresponding to the direction you specified in 'direction'. When the
        input contains a "/" but no underscore, the bare file name is
        returned unchanged.
    """
    if "_" not in filename and "/" in filename:
        log.warning(
            f"{filename} might be a single end reads file. The pattern being returned"
            + " is the entire filename"
        )
        return Path(filename).name
    # Illumina names end in "_R<n>_001.<ext>". The last token alone ("001...")
    # is shared by both directions, so build the pattern from the R<n> token.
    if illumina := re.search(r"_R[12](_\d{3}\.[^_/]*)$", filename):
        pattern = f"R{direction}{illumina.group(1)}"
        log.debug(f"pattern: {pattern}")
        return pattern
    direction_complement = "2" if direction == "1" else "1"
    pattern = filename.split("_")[-1]
    log.debug(f"pattern: {pattern}")
    return (
        pattern
        if direction in pattern
        else pattern.replace(direction_complement, direction)
    )

This function returns the pattern shared for half the paired-end reads files

Args: filename (str): Name of file to determine pattern from direction (str, optional): The direction you need the pattern for. Defaults to "1". Accepts "1" for forward or "2" for reverse

Returns: str: The pattern for all the forward or reverse paired-end reads file corresponding to the direction you specified in 'direction'

def get_prefix(filename: str, split_position: str = '-1'):
def get_prefix(filename: str, split_position: str = "-1"):
    """This function returns the prefix that is unique to (1) pair of paired-end files

    Args:
        filename (str): The name of the file to determine prefix from. Any
        directory part is ignored.
        split_position (str, optional): If underscores are in the filename and user
        just needs to trim the filename after a certain underscore, then
        this arg specifies where to trim, e.g.
        get_prefix("GSM1234_sample12_exp.fastq.gz", -1) returns "GSM1234_sample12".
        It is converted with int(), and a value of 0 disables this fallback.
        Defaults to "-1".

    Returns:
        str: Returns the prefix that is unique to a single pair of paired-end
        reads files. The integer 0 is returned when no prefix can be
        determined.
    """
    filename = Path(filename).name
    # illumina read files use a specific format, and sometimes allow underscores in the prefix
    # SampleName_S1_L001_R1_001.fastq.gz for lane 1
    # SampleName_S1_R1_001.fastq.gz for merged lanes.
    # The dots are escaped so that only a literal ".fastq.gz" suffix matches.
    if match := re.search(r"(.*?)_S\d+_.*R\d_001\.fastq\.gz", filename):
        log.debug(f"match: {match}\ngroup 1 (prefix): {match.group(1)}")
        return match.group(1)

    cut = int(split_position)
    if "_" in filename and cut:
        prefix_list = filename.split("_")[:cut]
        log.debug(f"prefix_list: {prefix_list}")
        return "_".join(prefix_list)

    log.warning(f"A prefix was not able to be determined for {filename}")
    return 0

This function returns the prefix that is unique to (1) pair of paired-end files

Args: filename (str): The name of the file to determine prefix from. split_position (str, optional): If underscores are in the filename and the user just needs to trim the filename after a certain underscore, then this arg specifies where to trim, e.g. get_prefix("GSM1234_sample12_exp.fastq.gz", -1) returns "GSM1234_sample12". Defaults to "-1".

Returns: str: Returns the prefix that is unique to a single pair of paired-end reads files.

def get_rev_file( fwd_file: str, name_only=False, pattern_fwd: bool = False, pattern_rev: bool = False):
def get_rev_file(
    fwd_file: str, name_only=False, pattern_fwd: bool = False, pattern_rev: bool = False
):
    """Build the reverse reads file path that matches a forward reads file

    The reverse path is produced by replacing the forward pattern with the
    reverse pattern in ``fwd_file``. A pattern left at its falsy default is
    autodetected with ``get_read_pattern``.

    Args:
        fwd_file (str): The forward file you want to find the reverse
        file for.
        name_only (bool, optional): Set to True if you want this function to
        return only the filename. Any truthy value is converted to text and
        counts as True only if that text contains "true" (case-insensitive).
        Defaults to False.
        pattern_fwd (bool, optional): The text to replace in ``fwd_file``.
        Defaults to False, which means autodetect.
        pattern_rev (bool, optional): The replacement text. Defaults to
        False, which means autodetect.

    Returns:
        str: The reverse reads file
    """
    if name_only:
        name_only = "true" in str(name_only).lower()
    if not pattern_fwd:
        pattern_fwd = get_read_pattern(fwd_file, "1")
        log.debug(f"Autodetected forward pattern: {pattern_fwd}")
    if not pattern_rev:
        pattern_rev = get_read_pattern(fwd_file, "2")
        log.debug(f"Autodetected reverse pattern: {pattern_rev}")
    log.debug(f"fwd_file: {fwd_file}\nWill replace {pattern_fwd} with {pattern_rev}")

    rev_file = fwd_file.replace(pattern_fwd, pattern_rev)
    if name_only:
        return Path(rev_file).name
    return rev_file

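Returns the reverse reads file that corresponds to a given forward reads file, by replacing the forward pattern in its path with the reverse pattern.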

Args: fwd_file (str): The forward file you want to find the reverse file for. name_only (bool, optional): Set to True if you want this function to return only the filename. Defaults to False. pattern_fwd (bool, optional): Specify the pattern to replace. Defaults to False. pattern_rev (bool, optional): Specify the replacement pattern. Defaults to False.

Returns: str: The reverse reads file