Source code for peptacular.isotope

from collections import Counter
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Final, cast

from tacular import ELEMENT_LOOKUP, FRAGMENT_ION_LOOKUP, ElementInfo, FragmentIonInfo, IonType, IonTypeLiteral

from . import constants

CARBON = ELEMENT_LOOKUP["C"]
HYDROGEN = ELEMENT_LOOKUP["H"]
NITROGEN = ELEMENT_LOOKUP["N"]
OXYGEN = ELEMENT_LOOKUP["O"]
SULFUR = ELEMENT_LOOKUP["S"]

AVERAGINE_RATIOS: Final[dict[ElementInfo, float]] = {CARBON: 0.044179, HYDROGEN: 0.069749, NITROGEN: 0.012344, OXYGEN: 0.013352, SULFUR: 0.0004}


def _chem_mass(
    formula: Mapping[str, int | float],
    monoisotopic: bool = True,
) -> float:
    m: float = 0.0
    for element, count in formula.items():
        m += ELEMENT_LOOKUP[element].get_mass(monoisotopic=monoisotopic) * count
    return m


[docs] @dataclass(frozen=True, slots=True) class IsotopicData: mass: float neutron_count: int abundance: float
def estimate_averagine_comp(neutral_mass: float, ion_type: str | IonType | IonTypeLiteral = "p") -> Mapping[ElementInfo, float]: """ Estimate elemental composition from molecular mass using the averagine model. .. code-block:: python # Example usage >>> round(estimate_averagine_comp(1000)['C'], 2) 44.18 """ composition: dict[ElementInfo, float] = {element: ratio * neutral_mass for element, ratio in AVERAGINE_RATIOS.items()} ion_info: FragmentIonInfo = FRAGMENT_ION_LOOKUP[ion_type] for elem_info, count in ion_info.composition.items(): if elem_info in composition: composition[elem_info] += count else: composition[elem_info] = count return composition def averagine_comp(neutral_mass: float) -> Counter[ElementInfo]: comp: Mapping[ElementInfo, int | float] = estimate_averagine_comp(neutral_mass) composition: Counter[ElementInfo] = Counter() for element, count in comp.items(): composition[element] = int(round(count)) # pop zeros for elem in list(composition.keys()): if composition[elem] == 0: del composition[elem] return composition def isotopic_distribution( chemical_formula: Mapping[str | ElementInfo, int | float], max_isotopes: int | None = 10, min_abundance_threshold: float = 0.001, # based on the most abundant peak distribution_resolution: int | None = 5, use_neutron_count: bool = False, conv_min_abundance_threshold: float = 10e-15, charge_state: int | None = None, ) -> list[IsotopicData]: """ Calculate the isotopic distribution for a given chemical formula. :param chemical_formula: Chemical formula with element counts. Non-integer counts will be rounded. :type chemical_formula: Dict[str, Union[int, float]] :param max_isotopes: Maximum number of isotopes to track during convolution. Limits memory usage but may affect accuracy. Use `min_abundance_threshold` to control final output. :type max_isotopes: Optional[int] :param min_abundance_threshold: Minimum relative abundance threshold for returned isotopes. :type min_abundance_threshold: float :param distribution_resolution: Decimal places for rounding masses during convolution. Simulates mass spectrometer resolution. :type distribution_resolution: Optional[int] :param use_neutron_count: If True, use neutron offsets instead of absolute masses. :type use_neutron_count: bool :param conv_min_abundance_threshold: Minimum abundance threshold during convolution. Works with `max_isotopes` to limit intermediate isotopes tracked. :type conv_min_abundance_threshold: float :return: Isotopic distribution normalized so the most abundant peak = 1.0. :rtype: List[IsotopicData] .. code-block:: python # Example usage >>> formula = {'C': 12, 'H': 6, 'N': 3} >>> result = isotopic_distribution(formula, 3, 0.0, 5) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(192.06, 1.0), (193.05, 0.01), (193.06, 0.13)] # Example usage with Isotopes Specified, '13C' is assumed to be # static and will not be used in convolution, though mass is corrected >>> formula = {'C': 12, 'H': 6, 'N': 3, '13C': 1} >>> result = isotopic_distribution(formula, 3, 0.0, 5) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(205.06, 1.0), (206.06, 0.01), (206.06, 0.13)] >>> formula = {'C': 12, 'H': 6, 'N': 3, 'Li': 0} >>> result = isotopic_distribution(formula, 3, 0.0, 5) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(192.06, 1.0), (193.05, 0.01), (193.06, 0.13)] >>> formula = {'C': 100, 'H': 100, 'N': 100, 'O': 100} >>> result = isotopic_distribution(formula, 3, 0.0, 5) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(4300.58, 0.92), (4301.58, 1.0), (4302.59, 0.54)] # Example usage >>> formula = {'C': 100, 'H': 150, 'N': 15, 'O': 20} >>> result = isotopic_distribution(formula, 3, 0.0, 5) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(1881.12, 0.92), (1882.12, 1.0), (1883.12, 0.54)] # Example usage with neutron count >>> formula = {'C': 100, 'H': 150, 'N': 15, 'O': 20} >>> result = isotopic_distribution(formula, 3, 0.0, 5, True) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(0.0, 0.86), (1.0, 1.0), (2.0, 0.61)] # Example usage with negative values >>> formula = {'C': 100, 'Li': 10, 'N': 15, 'O': 20} >>> result = isotopic_distribution(formula, 3, 0.0, 5, True) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(-1.0, 0.54), (0.0, 1.0), (1.0, 0.81)] # Example usage with floating values >>> formula = {'C': 100, 'H': 150.5, 'N': 15, 'O': 20} >>> result = isotopic_distribution(formula, 3, 0.0, 5, False) >>> [(round(r.mass, 2), round(r.abundance, 2)) for r in result] [(1881.62, 0.92), (1882.63, 1.0), (1883.63, 0.54)] # Example usage with floating values >>> formula = {'C': -100, 'H': 150.5, 'N': 15, 'O': 20} >>> isotopic_distribution(formula, 3, 0.0, 5, False) Traceback (most recent call last): ValueError: Negative values are not allowed in the chemical formula. """ composition: dict[str, float | int] = {} for key, value in chemical_formula.items(): if isinstance(key, str): composition[key] = value else: composition[str(key)] = value # Just use these to correct mass if charge_state == 0 or charge_state is None: particle_mass_offset = 0.0 else: particle_mass_offset = -charge_state * constants.ELECTRON_MASS # check if any values are negative: if any(v < 0 for v in composition.values()): raise ValueError("Negative values are not allowed in the chemical formula.") # Remove 0 values from the chemical formula for elem in list(composition.keys()): if composition[elem] == 0: del composition[elem] delta_mass = 0.0 if not all(isinstance(v, int) for v in composition.values()): prior_mass = _chem_mass(composition) composition = {k: int(round(v)) for k, v in composition.items()} post_mass = _chem_mass(composition) delta_mass = prior_mass - post_mass total_distribution = {0.0: (1.0, 0)} # Start with a base distribution (mass/offset: (abundance, neutron_count)) for element, count in composition.items(): elemental_distribution = _calculate_elemental_distribution( element, int(count), use_neutron_count, conv_min_abundance_threshold, max_isotopes, ) total_distribution = _convolve_distributions( total_distribution, elemental_distribution, max_isotopes, conv_min_abundance_threshold, distribution_resolution, ) # Normalize abundances before filtering to ensure consistent abundance calculation abundances = [abundance for abundance, _ in total_distribution.values()] if not abundances: return [] max_abundance = max(abundances) normalized_distribution = [ (mass, abundance / max_abundance, neutron_count) for mass, (abundance, neutron_count) in sorted(total_distribution.items()) if abundance / max_abundance >= min_abundance_threshold ] if delta_mass != 0.0 and not use_neutron_count: normalized_distribution = [ (mass + delta_mass + particle_mass_offset, abundance, neutron_count) for mass, abundance, neutron_count in normalized_distribution ] return [IsotopicData(mass=mass, neutron_count=neutron_count, abundance=abundance) for mass, abundance, neutron_count in normalized_distribution] def merge_isotopic_distributions(*distributions: list[IsotopicData], merge_precision: int | None = None) -> list[IsotopicData]: """ Merge multiple isotopic distributions by summing abundances at each mass. :param distributions: Isotopic distributions to merge. :type distributions: List[IsotopicData] :param merge_precision: Decimal places for rounding masses during merge. :type merge_precision: Optional[int] :return: Merged isotopic distribution. :rtype: List[IsotopicData] .. code-block:: python # Example usage >>> d1 = [IsotopicData(1.0, 0, 0.5), IsotopicData(2.0, 1, 0.5)] >>> d2 = [IsotopicData(1.0, 0, 0.5), IsotopicData(2.0, 1, 0.5)] >>> result = merge_isotopic_distributions(d1, d2) >>> [(r.mass, r.abundance) for r in result] [(1.0, 1.0), (2.0, 1.0)] """ merged_distribution: dict[float, tuple[float, int]] = {} for distribution in distributions: for isotope in distribution: mass = isotope.mass if merge_precision is not None: mass = round(mass, merge_precision) # Merge the distributions, keep neutron count from first occurrence if mass in merged_distribution: merged_distribution[mass] = ( merged_distribution[mass][0] + isotope.abundance, merged_distribution[mass][1], # Keep existing neutron count ) else: merged_distribution[mass] = (isotope.abundance, isotope.neutron_count) return [ IsotopicData(mass=mass, neutron_count=neutron_count, abundance=abundance) for mass, (abundance, neutron_count) in sorted(merged_distribution.items(), key=lambda x: x[0]) ] def estimate_isotopic_distribution( neutral_mass: float, max_isotopes: int | None = 10, min_abundance_threshold: float = 0.001, distribution_resolution: int | None = 5, use_neutron_count: bool = False, conv_min_abundance_threshold: float = 1e-15, ) -> list[IsotopicData]: """ Estimate isotopic distribution from molecular mass using the averagine model. :param neutral_mass: Neutral mass of the molecule. :type neutral_mass: float :param max_isotopes: Maximum number of isotopes to track during convolution. :type max_isotopes: int :param min_abundance_threshold: Minimum relative abundance threshold for returned isotopes. :type min_abundance_threshold: float :param distribution_resolution: Decimal places for rounding masses during convolution. :type distribution_resolution: Optional[int] :param use_neutron_count: If True, use neutron offsets instead of absolute masses. :type use_neutron_count: bool :param conv_min_abundance_threshold: Minimum abundance threshold during convolution. :type conv_min_abundance_threshold: float :return: Predicted isotopic distribution normalized so the most abundant peak = 1.0. :rtype: List[IsotopicData] .. code-block:: python # Example usage >>> result = estimate_isotopic_distribution(800, 3, 0.0, 5) >>> [(round(r.mass, 3), round(r.abundance, 3)) for r in result] [(810.424, 1.0), (811.427, 0.379), (812.43, 0.07)] # Example usage with neutron count >>> result = estimate_isotopic_distribution(800, 3, 0.0, 5, True) >>> [(round(r.mass, 3), round(r.abundance, 3)) for r in result] [(0.0, 1.0), (1.0, 0.426), (2.0, 0.113)] """ # Calculate the total number of each atom in the molecule based on its molecular mass total_atoms: Counter[ElementInfo] = averagine_comp(neutral_mass) return isotopic_distribution( cast(Mapping[str | ElementInfo, int | float], total_atoms), max_isotopes, min_abundance_threshold, distribution_resolution, use_neutron_count, conv_min_abundance_threshold, ) def _convolve_distributions( dist1: dict[float, tuple[float, int]], dist2: dict[float, tuple[float, int]], max_isotopes: int | None, min_abundance_threshold: float, distribution_resolution: int | None, ) -> dict[float, tuple[float, int]]: """ Convolve two distributions to calculate the distribution of their sum. :param dist1: First distribution mapping mass/offset to (abundance, neutron_count). :type dist1: Dict[float, Tuple[float, int]] :param dist2: Second distribution mapping mass/offset to (abundance, neutron_count). :type dist2: Dict[float, Tuple[float, int]] :param max_isotopes: Maximum number of isotopes to retain based on abundance. :type max_isotopes: Optional[int] :param min_abundance_threshold: Minimum abundance threshold for keeping isotopes. :type min_abundance_threshold: float :param distribution_resolution: Decimal places for rounding masses. :type distribution_resolution: Optional[int] :return: Convolved isotopic distribution as mass-to-(abundance, neutron_count) mapping. :rtype: Dict[float, Tuple[float, int]] .. code-block:: python # Example usage >>> d1 = {1.0: (0.5, 0), 2.0: (0.5, 1)} >>> d2 = {1.0: (0.5, 0), 2.0: (0.5, 1)} >>> result = _convolve_distributions(d1, d2, 3, 0.0, 5) >>> sorted(result.items()) [(2.0, (0.25, 0)), (3.0, (0.5, 1)), (4.0, (0.25, 2))] """ result: dict[float, tuple[float, int]] = {} for mass1, (abundance1, neutron1) in dist1.items(): for mass2, (abundance2, neutron2) in dist2.items(): new_abundance = abundance1 * abundance2 if new_abundance < min_abundance_threshold: break new_mass = mass1 + mass2 if distribution_resolution is not None: new_mass = round(new_mass, distribution_resolution) new_neutron_count = neutron1 + neutron2 if new_mass in result: result[new_mass] = ( result[new_mass][0] + new_abundance, new_neutron_count, ) else: result[new_mass] = (new_abundance, new_neutron_count) # Apply max isotopes limit and sort by abundance sorted_result = sorted(result.items(), key=lambda x: x[1][0], reverse=True) if max_isotopes is not None: # Retain only the top `max_isotopes` isotopes based on abundance return dict(sorted_result[:max_isotopes]) return result def _calculate_elemental_distribution_slow( element: str, count: int, use_neutron_count: bool, min_abundance_threshold: float = 10e-9, ) -> dict[float, tuple[float, int]]: """ Calculate the isotopic distribution for an element. :param element: The element. :type element: str :param count: The number of atoms. :type count: int :param use_neutron_count: Whether to use neutron offsets instead of masses. :type use_neutron_count: bool :return: The isotopic distribution mapping mass/offset to (abundance, neutron_count). :rtype: Dict[float, Tuple[float, int]] .. code-block:: python # Example usage >>> _calculate_elemental_distribution_slow('C', 2, False) {24.0: (0.9787144899999999, 0), 25.00335483507: (0.02117102, 1), 26.00670967014: (0.00011448999999999998, 2)} # Example using neutron count >>> _calculate_elemental_distribution_slow('C', 2, True) {0.0: (0.9787144899999999, 0), 1.0: (0.02117102, 1), 2.0: (0.00011448999999999998, 2)} """ if use_neutron_count is True: isotopes = ELEMENT_LOOKUP.get_neutron_offsets_and_abundances(element) else: isotopes = ELEMENT_LOOKUP.get_masses_and_abundances(element) # Start with a distribution for an element not present (mass=0, abundance=1, neutron_count=0) distribution = {0.0: (1.0, 0)} for _ in range(count): # Update the distribution by convolving it with the isotopes' distribution each time # Convert isotopes list to dict with neutron count tracking isotope_distribution: dict[float, tuple[float, int]] = {} for i, (mass_or_offset, abundance) in enumerate(isotopes): # When use_neutron_count=True, mass_or_offset is already the neutron offset (int) # When use_neutron_count=False, mass_or_offset is mass, and we track neutron by index neutron_count = int(mass_or_offset) if use_neutron_count else i isotope_distribution[mass_or_offset] = (abundance, neutron_count) distribution = _convolve_distributions(distribution, isotope_distribution, None, min_abundance_threshold, None) return distribution def _calculate_elemental_distribution( element: str | ElementInfo, count: int, use_neutron_count: bool, min_abundance_threshold: float = 10e-15, max_isotopes: int | None = None, ) -> dict[float, tuple[float, int]]: """Calculate elemental isotopic distribution using binary exponentiation for efficiency.""" if not isinstance(element, ElementInfo): elem_info = ELEMENT_LOOKUP[element] else: elem_info = element if elem_info.mass_number is not None: # Monoisotopic elements have only one isotope if use_neutron_count: return {0.0: (1.0, 0)} else: mass = elem_info.get_mass(monoisotopic=True) return {mass: (1.0, 0)} if use_neutron_count: isotopes = ELEMENT_LOOKUP.get_neutron_offsets_and_abundances(element) else: isotopes = ELEMENT_LOOKUP.get_masses_and_abundances(element) # Build base distribution isotope_distribution = {} for i, (mass_or_offset, abundance) in enumerate(isotopes): neutron_count = int(mass_or_offset) if use_neutron_count else i isotope_distribution[mass_or_offset] = (abundance, neutron_count) # Fast exponentiation by squaring: compute isotope_distribution^count result = {0.0: (1.0, 0)} base = isotope_distribution while count > 0: if count % 2 == 1: result = _convolve_distributions(result, base, max_isotopes, min_abundance_threshold, None) base = _convolve_distributions(base, base, max_isotopes, min_abundance_threshold, None) count //= 2 return result
[docs] class IsotopeLookup: def __init__( self, mass_step: int = 50, max_isotopes: int = 25, min_abundance_threshold: float = 0.005, use_neutron_count: bool = True, is_abundance_sum: bool = True, ): self.mass_step = mass_step self.max_isotopes = max_isotopes self.min_abundance_threshold = min_abundance_threshold self.use_neutron_count = use_neutron_count self.is_abundance_sum = is_abundance_sum self.lookup: dict[int, list[IsotopicData]] = {} def _get_mass_bin(self, mass: float) -> int: """Calculate the mass bin for a given mass.""" return round(mass / self.mass_step) * self.mass_step def _generate_pattern(self, mass_bin: int) -> list[IsotopicData]: """Generate isotope pattern for a specific mass bin.""" iso_pattern: list[IsotopicData] = estimate_isotopic_distribution( neutral_mass=mass_bin, max_isotopes=self.max_isotopes, min_abundance_threshold=self.min_abundance_threshold, use_neutron_count=self.use_neutron_count, ) return iso_pattern
[docs] def get_isotope_pattern(self, mass: float) -> list[IsotopicData]: """ Get the isotope pattern for a given mass. Generates and caches the pattern if not already present. """ mass_bin = self._get_mass_bin(mass) # Check if pattern exists, if not generate it if mass_bin not in self.lookup: self.lookup[mass_bin] = self._generate_pattern(mass_bin) return self.lookup[mass_bin]
[docs] def clear_cache(self): """Clear the cached isotope patterns.""" self.lookup.clear()
[docs] def get_cache_size(self) -> int: """Return the number of cached isotope patterns.""" return len(self.lookup)