'''
mzML is a standard rich XML-format for raw mass spectrometry data storage.
Please refer to `psidev.info <http://www.psidev.info/index.php?q=node/257>`_
for the detailed specification of the format and structure of mzML files.
In addition to mzML, there is a wrapping format called ``indexedmzML``
which adds an extra layer to the XML document, including pre-computed byte offsets
for each ``<spectrum>`` and ``<chromatogram>`` element.
To write ``mzML`` without an index use :class:`PlainMzMLWriter`, and for ``indexedmzML``
use :class:`IndexedMzMLWriter`. Because so many tools rely on the index, :class:`IndexedMzMLWriter`
is exported under the alias `MzMLWriter`. The interface for these two classes is the same,
with :class:`IndexedMzMLWriter` having slightly more complex behavior on writing and when finishing
the document, though you are able to alter the indexing behavior via :attr:`IndexedMzMLWriter.index_builder`
or through inheritance.
'''
import numbers
import warnings
from typing import List, Optional, Dict, Tuple, Mapping, Iterable, TypedDict, Union, DefaultDict
from collections import defaultdict
import numpy as np
from psims.xml import CVParam, XMLWriterMixin, XMLDocumentWriter
from psims.controlled_vocabulary import Entity
from psims.utils import TableStateMachine
from .components import (
ComponentDispatcher, FileDescription, Software, Spectrum, element,
default_cv_list, MzML, InstrumentConfiguration, IndexedMzML)
from .binary_encoding import (
encode_array, COMPRESSION_ZLIB,
encoding_map, compression_map, dtype_to_encoding)
from .utils import ensure_iterable
from .index import IndexingStream
from .native_id import NativeIDParser
from .element_builder import ElementBuilder, ParamManagingProperty
# Names of the data array types this module refers to directly.
MZ_ARRAY = 'm/z array'
INTENSITY_ARRAY = 'intensity array'
CHARGE_ARRAY = 'charge array'
TIME_ARRAY = "time array"

# Marker added alongside any array whose name is not listed in ARRAY_TYPES.
NON_STANDARD_ARRAY = 'non-standard data array'

# Units used as the default values of the ``intensity_unit`` and ``time_unit``
# arguments of the writer methods.
DEFAULT_INTENSITY_UNIT = "number of detector counts"
DEFAULT_TIME_UNIT = "minute"

# Array type names treated as standard; anything else is additionally tagged
# with NON_STANDARD_ARRAY when a binary data array is prepared.
ARRAY_TYPES = [
    "ion mobility array",
    "temperature array",
    "pressure array",
    "flow rate array",
    TIME_ARRAY,
    "resolution array",
    "baseline array",
    "mean charge array",
    "signal to noise array",
    CHARGE_ARRAY,
    INTENSITY_ARRAY,
    MZ_ARRAY,
    "scanning quadrupole position upper bound m/z array",
    "scanning quadrupole position lower bound m/z array",
    "noise array",
    "wavelength array",
    "sampled noise m/z array",
    "sampled noise intensity array",
    "sampled noise baseline array",
    "mass array",
    "raw inverse reduced ion mobility array",
    "mean inverse reduced ion mobility array",
    "raw ion mobility array",
    "mean ion mobility array",
    "mean ion mobility drift time array",
    "raw ion mobility drift time array",
    "deconvoluted ion mobility drift time array",
    "deconvoluted ion mobility array",
    "deconvoluted inverse reduced ion mobility array",
]
class ArrayTypeSpec(TypedDict):
    """Dictionary form of a data array type: the array's name plus its unit.

    NOTE(review): the array type dictionaries built elsewhere in this module
    use a ``unit_name`` key rather than ``unit`` -- confirm which key is the
    intended one.
    """
    name: str
    unit: Union[Mapping[str, str], CVParam]


# An array type may be given either as a bare name or as an ArrayTypeSpec.
ArrayType = Union[str, ArrayTypeSpec]
class DocumentSection(ComponentDispatcher, XMLWriterMixin):
    """Context manager that keeps a single XML element open on a writer.

    Entering the section (or calling :meth:`begin`) opens the element named by
    ``section`` with ``section_args`` as its attributes; leaving it (or calling
    :meth:`end`) closes the element and flushes the writer.

    Parameters
    ----------
    section : str or component
        What to hand to :func:`element` when the section is opened.
    writer : object
        The XML writer the element is written to.
    parent_context : object
        The document context passed on to :class:`ComponentDispatcher`.
    section_args : Mapping, optional
        Attributes for the element. The mapping is copied, never modified.
    **kwargs
        Additional attributes, merged over ``section_args``.
    """

    def __init__(self, section, writer, parent_context, section_args=None, **kwargs):
        # Work on a private copy: this class and its subclasses add to and pop
        # from the arguments, which must not leak back into the caller's dict.
        section_args = dict(section_args) if section_args is not None else {}
        section_args.update(kwargs)
        super(DocumentSection, self).__init__(parent_context)
        self.section = section
        self.writer = writer
        self.section_args = section_args

    @property
    def native_id_format(self):
        """The nativeID format of the writer this section belongs to."""
        return self.writer.native_id_format

    def __enter__(self):
        return self.begin()

    def __exit__(self, exc_type, exc_value, traceback):
        self.end(exc_type, exc_value, traceback)

    def begin(self):
        """Open the section's element and return this section."""
        self.toplevel = element(self.writer, self.section, **self.section_args)
        self.toplevel.__enter__()
        return self

    def end(self, exc_type=None, exc_value=None, traceback=None):
        """Close the section's element, forwarding any exception details, and flush."""
        self.toplevel.__exit__(exc_type, exc_value, traceback)
        self.writer.flush()
class SpectrumListSection(DocumentSection):
    """Section for the ``spectrumList`` element.

    Sets a default ``count`` of 0 and turns the ``data_processing_method``
    argument into the ``defaultDataProcessingRef`` attribute. When the named
    method is not registered in the context, the first registered data
    processing entry is used; when there is none, a warning is issued and the
    attribute is left unset.
    """

    def __init__(self, writer, parent_context, section_args=None, **kwargs):
        super().__init__(
            "spectrumList", writer, parent_context, section_args=section_args,
            **kwargs)
        attributes = self.section_args
        attributes.setdefault("count", 0)
        method_key = attributes.pop("data_processing_method", None)
        try:
            default_ref = self.context["DataProcessing"][method_key]
        except KeyError:
            # Unknown (or unspecified) method: fall back to the first known one.
            try:
                default_ref = list(self.context["DataProcessing"].values())[0]
            except IndexError:
                warnings.warn(
                    "No Data Processing method found. mzML file may not be fully standard-compliant",
                    stacklevel=2)
                return
        attributes["defaultDataProcessingRef"] = default_ref
class ChromatogramListSection(DocumentSection):
    """Section for the ``chromatogramList`` element.

    Sets a default ``count`` of 0 and turns the ``data_processing_method``
    argument into the ``defaultDataProcessingRef`` attribute. When the named
    method is not registered in the context, the first registered data
    processing entry is used; when there is none, a warning is issued and the
    attribute is left unset.
    """

    def __init__(self, writer, parent_context, section_args=None, **kwargs):
        super().__init__(
            "chromatogramList", writer, parent_context,
            section_args=section_args, **kwargs)
        attributes = self.section_args
        attributes.setdefault("count", 0)
        method_key = attributes.pop("data_processing_method", None)
        try:
            default_ref = self.context["DataProcessing"][method_key]
        except KeyError:
            # Unknown (or unspecified) method: fall back to the first known one.
            try:
                default_ref = list(self.context["DataProcessing"].values())[0]
            except IndexError:
                warnings.warn(
                    "No Data Processing method found. mzML file may not be fully standard-compliant",
                    stacklevel=2)
                return
        attributes["defaultDataProcessingRef"] = default_ref
class RunSection(DocumentSection):
    """Section for the ``run`` element.

    The ``instrument_configuration``, ``source_file`` and ``sample`` arguments
    are removed from the section arguments and, when not ``None``, looked up in
    the document context to fill the corresponding reference attributes.
    """

    def __init__(self, writer, parent_context, section_args=None, **kwargs):
        super().__init__(
            "run", writer, parent_context, section_args=section_args, **kwargs)
        # (argument name, context table, XML attribute), resolved in this order.
        reference_table = (
            ("instrument_configuration", "InstrumentConfiguration",
             "defaultInstrumentConfigurationRef"),
            ("source_file", "SourceFile", "defaultSourceFileRef"),
            ("sample", 'Sample', "sampleRef"),
        )
        for argument, table, attribute in reference_table:
            key = self.section_args.pop(argument, None)
            if key is not None:
                self.section_args[attribute] = self.context[table][key]
class IndexedmzMLSection(DocumentSection):
    """Section that wraps an ``MzML`` element inside an ``IndexedMzML`` element.

    On :meth:`end` the inner element is closed and the writer flushed before
    the index is written, and only then is the outer element closed.

    Parameters
    ----------
    writer : object
        The XML writer the elements are written to.
    parent_context : object
        The document context passed on to :class:`DocumentSection`.
    indexer : object
        Object whose ``to_xml`` method writes the index; it receives this section.
    section_args : Mapping, optional
        Arguments used to build the inner ``MzML`` component.
    **kwargs
        Additional arguments, merged over ``section_args``.
    """

    def __init__(self, writer, parent_context, indexer, section_args=None, **kwargs):
        super(IndexedmzMLSection, self).__init__(
            'indexedmzML', writer, parent_context, section_args=section_args,
            **kwargs)
        self.toplevel = None
        self.inner = None
        self.indexer = indexer

    def begin(self):
        """Open the outer and inner elements and return this section."""
        self.toplevel = element(self.writer, IndexedMzML())
        self.toplevel.__enter__()
        self.inner = element(self.writer, MzML(**self.section_args))
        self.inner.__enter__()
        # Match DocumentSection.begin so that ``with section as s`` binds the
        # section rather than None.
        return self

    def end(self, exc_type=None, exc_value=None, traceback=None):
        """Close the inner element, write the index, then close the outer element."""
        self.inner.__exit__(exc_type, exc_value, traceback)
        # Flush before the index is written, while the outer element is still open.
        self.writer.flush()
        self.write_index()
        self.toplevel.__exit__(exc_type, exc_value, traceback)

    def write_index(self):
        """Ask the indexer to write itself, passing this section."""
        self.indexer.to_xml(self)
class PlainMzMLWriter(ComponentDispatcher, XMLDocumentWriter):
"""A high level API for generating mzML XML files from simple Python objects.
This class depends heavily on lxml's incremental file writing API which in turn
depends heavily on context managers. Almost all logic is handled inside a context
manager and in the context of a particular document. Since all operations assume
that they have access to a universal identity map for each element in the document,
that map is centralized in this class.
MzMLWriter inherits from :class:`~psims.mzml.components.ComponentDispatcher`, giving it a :attr:`context`
attribute and access to all `Component` objects pre-bound to that context with attribute-access
notation.
Attributes
----------
chromatogram_count : int
A count of the number of chromatograms written
spectrum_count : int
A count of the number of spectra written
"""
DEFAULT_TIME_UNIT = DEFAULT_TIME_UNIT
DEFAULT_INTENSITY_UNIT = DEFAULT_INTENSITY_UNIT
def __init__(self, outfile, close=None, vocabularies=None, missing_reference_is_error=False,
             vocabulary_resolver=None, id=None, accession=None, native_id_format: str=None,
             **kwargs):
    """Set up the writer's vocabularies, output stream and document state.

    Parameters
    ----------
    outfile : object
        Passed to :class:`XMLDocumentWriter` as the output target.
    close : bool, optional
        Passed to :class:`XMLDocumentWriter`.
    vocabularies : list, optional
        Controlled vocabularies appended after the default list.
    missing_reference_is_error : bool, optional
        Passed to :class:`ComponentDispatcher`.
    vocabulary_resolver : object, optional
        Passed to :class:`ComponentDispatcher`.
    id : str, optional
        The ``id`` given to the top level ``MzML`` component.
    accession : str, optional
        The ``accession`` given to the top level ``MzML`` component.
    native_id_format : str, optional
        Name of the nativeID format to assume. Defaults to
        "multiple peak list nativeID format".
    **kwargs
        Forwarded to :class:`XMLDocumentWriter`.
    """
    extra_vocabularies = [] if vocabularies is None else vocabularies
    if native_id_format is None:
        native_id_format = "multiple peak list nativeID format"
    vocabularies = list(default_cv_list) + list(extra_vocabularies)

    ComponentDispatcher.__init__(
        self,
        vocabularies=vocabularies,
        vocabulary_resolver=vocabulary_resolver,
        missing_reference_is_error=missing_reference_is_error)
    XMLDocumentWriter.__init__(self, outfile, close, **kwargs)

    self.id = id
    self.accession = accession
    self.spectrum_count = 0
    self.chromatogram_count = 0
    self.default_instrument_configuration = None

    # Each entry pairs a document state with the states that may follow it.
    section_order = [
        ("start", ['controlled_vocabularies', ]),
        ("controlled_vocabularies", ['file_description', ]),
        ("file_description", ['reference_param_group_list', 'sample_list', 'software_list']),
        ("reference_param_group_list", ['sample_list', 'software_list']),
        ("sample_list", ['software_list', ]),
        ("software_list", ["scan_settings_list", 'instrument_configuration_list']),
        ("scan_settings_list", ['instrument_configuration_list', ]),
        ("instrument_configuration_list", ['data_processing_list']),
        ("data_processing_list", ['run']),
        ("run", ['spectrum_list', 'chromatogram_list']),
        ('spectrum_list', ['chromatogram_list']),
        ('chromatogram_list', []),
    ]
    self.state_machine = TableStateMachine(section_order)

    # Set the private attribute directly so the format does not count as
    # explicitly configured yet.
    self._native_id_format = self._find_native_id_parser(native_id_format)
    self.native_id_format_configured = False
    self.add_context_key('native_id_formatter', self._native_id_maker)
@property
def native_id_format(self):
    '''The nativeID format assumed for the spectra of this data file.

    It determines how an integer is converted into a spectrum's
    :attr:`~.Spectrum.id`. The default is ``MS:1000774``, "multiple peak list
    nativeID format", whose pattern is ``index=<number>``.

    Spectrum id values that are given as already formatted strings are not
    affected by this attribute.

    .. note::
        If not explicitly specified, but a term naming an ID format is passed as a
        parameter in file contents, that will be used. The ID format from source
        files will **not** be used.

    Returns
    -------
    :class:`~.NativeIDParser`
    '''
    current_format = self._native_id_format
    return current_format

@native_id_format.setter
def native_id_format(self, native_id_format: str):
    '''Set the nativeID format to use for this file.

    The format may be named by its name, by its CV ID, e.g. "MS:1000774", or by a
    :class:`~psims.controlled_vocabulary.entity.Entity` describing the same term.

    Setting this attribute marks the format as explicitly configured, which
    *prevents* the nativeID format given in ``<fileContent>`` from being used
    automatically.

    Parameters
    ----------
    native_id_format : :class:`~.Union`[str, :class:`~psims.controlled_vocabulary.entity.Entity`]
        The nativeID format to use for this file.
    '''
    parser = self._find_native_id_parser(native_id_format)
    self._native_id_format = parser
    self.native_id_format_configured = True
def _find_native_id_parser(self, name: Union[str, Entity]) -> NativeIDParser:
    """Build the :class:`NativeIDParser` for a nativeID format.

    Parameters
    ----------
    name : str or :class:`~psims.controlled_vocabulary.entity.Entity`
        The format, either as something :meth:`term` can resolve or as an
        already resolved term.

    Returns
    -------
    :class:`NativeIDParser`
    """
    # Only unresolved names need a vocabulary lookup; an Entity is used as the
    # term itself (previously ``term`` was left unbound in that case).
    term = name if isinstance(name, Entity) else self.term(name)
    return NativeIDParser.from_term(term)
def _native_id_maker(self, _tag_name, number):
return self._native_id_format.format_integer(number)
def toplevel_tag(self):
    """Build the ``MzML`` component carrying this writer's ``id`` and ``accession``."""
    root = MzML(id=self.id, accession=self.accession)
    return root
def controlled_vocabularies(self):
    """Write out the `<cvList>` element and all its children.

    The document state is advanced to ``controlled_vocabularies`` first, then
    the writing itself is delegated to the parent class implementation.

    This method requires writing to have begun.
    """
    self.state_machine.transition("controlled_vocabularies")
    super().controlled_vocabularies()
def software_list(self, software_list: Iterable[Union[Software, Mapping]]):
    """Writes the ``<softwareList>`` section of the document.

    .. note::
        List and descriptions of software used to acquire and/or process the
        data in this mzML file

    Parameters
    ----------
    software_list : list
        A list or other iterable of :class:`dict` or :class:`~.Software`-like objects.
        Inputs without a length, such as generators, are accepted as well.
    """
    self.state_machine.transition("software_list")
    # ``len`` is only meaningful for sized inputs. Anything without a length
    # is normalized through ``ensure_iterable`` instead of failing here. A
    # sized but empty input is passed through untouched, as before.
    if not hasattr(software_list, '__len__') or len(software_list):
        software_list = [self.Software.ensure(sw) for sw in ensure_iterable(software_list)]
    self.SoftwareList(software_list).write(self)
def file_description(self, file_contents=None, source_files=None, contacts=None):
    r"""Writes the ``<fileDescription>`` section of the document.

    If ``file_contents`` contains a nativeID term, and :attr:`native_id_format` has
    not been set explicitly, that ID format will be used for this document. If the
    format has been set explicitly, its name is added to the file content instead.

    .. note::
        Information pertaining to the entire mzML file (i.e. not specific
        to any part of the data set) is stored here.

    Parameters
    ----------
    file_contents : list, optional
        A list or other iterable of :class:`str`, :class:`dict`, or \*Param-types which will
        be placed in the ``<fileContent>`` element.
    source_files : list
        A list or other iterable of dict or :class:`~psims.mzml.components.SourceFile`-like objects
        to be placed in the ``<sourceFileList>`` element
    contacts : list, optional
        A list or other iterable of objects that ``Contact.ensure`` accepts.
    """
    self.state_machine.transition("file_description")
    fd: FileDescription = self.FileDescription(
        file_contents, [self.SourceFile.ensure(sf) for sf in ensure_iterable(source_files)],
        contacts=[self.Contact.ensure(c) for c in ensure_iterable(contacts)])
    native_id_format = fd.content.native_id_format
    if native_id_format and not self.native_id_format_configured:
        # Store the parser on the private attribute, as ``__init__`` does.
        # Going through the public setter would pass a NativeIDParser to
        # ``_find_native_id_parser`` (which expects a name or Entity) and would
        # flag an inferred format as explicitly configured.
        self._native_id_format = NativeIDParser.from_term(native_id_format)
    elif self.native_id_format_configured:
        # Record the explicitly chosen format in the file content.
        fd.content.add_param(self.native_id_format.name)
    fd.write(self.writer)
def instrument_configuration_list(self, instrument_configurations):
    """Writes the ``<instrumentConfigurationList>`` section of the document.

    .. note::
        List and descriptions of instrument configurations. At least one instrument configuration MUST
        be specified, even if it is only to specify that the instrument is unknown. In that case, the
        "instrument model" term is used to indicate the unknown instrument in the instrumentConfiguration

    Parameters
    ----------
    instrument_configurations : list
        A list or other iterable of :class:`dict` or :class:`~.InstrumentConfiguration`-like
        objects
    """
    self.state_machine.transition("instrument_configuration_list")
    configs = []
    for candidate in ensure_iterable(instrument_configurations):
        if isinstance(candidate, InstrumentConfiguration):
            # Already a component: use it unchanged.
            configs.append(candidate)
        else:
            configs.append(self.InstrumentConfiguration.ensure(candidate))
    self.InstrumentConfigurationList(configs).write(self)
def data_processing_list(self, data_processing):
    """Writes the ``<dataProcessingList>`` section of the document.

    .. note::
        List and descriptions of data processing applied to this data

    Parameters
    ----------
    data_processing : list
        A list or other iterable of :class:`dict` or :class:`~.DataProcessing`-like
        objects
    """
    self.state_machine.transition("data_processing_list")
    methods = []
    for entry in ensure_iterable(data_processing):
        methods.append(self.DataProcessing.ensure(entry))
    self.DataProcessingList(methods).write(self)
def reference_param_group_list(self, groups):
    """Writes the ``<referenceableParamGroupList>`` section of the document.

    Parameters
    ----------
    groups : list
        A list or other iterable of :class:`dict` or :class:`~.ReferenceableParamGroup`-like
        objects
    """
    self.state_machine.transition("reference_param_group_list")
    prepared = []
    for entry in ensure_iterable(groups):
        prepared.append(self.ReferenceableParamGroup.ensure(entry))
    self.ReferenceableParamGroupList(prepared).write(self)
def sample_list(self, samples):
    """Writes the ``<sampleList>`` section of the document

    Samples given as mappings get a missing ``id`` and/or ``name`` filled in:
    a missing id is derived from the name and position, a missing name from
    the id, and when both are missing they are generated from the position.
    The caller's mappings are copied, not modified.

    Parameters
    ----------
    samples : list
        A list or other iterable of :class:`dict` or :class:`~.mzml.components.Sample`-like
        objects
    """
    self.state_machine.transition("sample_list")
    # Collect the samples in a single pass so that one-shot iterables are not
    # exhausted before they reach ``ensure_all``.
    prepared = []
    for i, sample in enumerate(ensure_iterable(samples)):
        if isinstance(sample, Mapping):
            # Fill in defaults on a copy rather than on the caller's mapping.
            sample = dict(sample)
            sample_id = sample.get('id')
            sample_name = sample.get("name")
            if sample_id is None and sample_name is not None:
                sample_id = "%s_%d_id" % (sample_name, i)
            elif sample_id is not None and sample_name is None:
                sample_name = str(sample_id)
            elif sample_id is None and sample_name is None:
                sample_id = "sample_%d_id" % (i,)
                sample_name = "sample_%d" % (i,)
            sample['id'] = sample_id
            sample['name'] = sample_name
        prepared.append(sample)
    sample_entries = self.Sample.ensure_all(prepared)
    self.SampleList(sample_entries).write(self)
def scan_settings_list(self, scan_settings):
    """Write the scan settings list section of the document.

    Parameters
    ----------
    scan_settings : list
        The objects handed to ``ScanSettings.ensure_all`` to build the entries.
    """
    self.state_machine.transition("scan_settings_list")
    entries = self.ScanSettings.ensure_all(scan_settings)
    section = self.ScanSettingsList(entries)
    section.write(self)
def run(self, id=None, instrument_configuration=None, source_file=None, start_time=None, sample=None):
    """Begins the `<run>` section of the document, describing a single
    sample run.

    When no instrument configuration is named, the first one registered in the
    document context is used, if there is any. The configuration finally chosen
    is remembered as :attr:`default_instrument_configuration`.

    Parameters
    ----------
    id : str, optional
        The unique identifier for this element
    instrument_configuration : str, optional
        The id string for the default `InstrumentConfiguration` for this
        sample
    source_file : str, optional
        The id string for the source file used to produce this data
    start_time : str, optional
        A string encoding the date and time the sample was acquired
    sample: str, optional
        The id string for the sample used to produce this data

    Returns
    -------
    RunSection
    """
    self.state_machine.transition("run")
    run_attributes = {}
    if start_time is not None:
        run_attributes['startTimeStamp'] = start_time
    if instrument_configuration is None:
        known_ids = list(self.context['InstrumentConfiguration'].keys())
        instrument_configuration = known_ids[0] if known_ids else None
    self.default_instrument_configuration = instrument_configuration
    return RunSection(
        self.writer, self.context, id=id,
        instrument_configuration=instrument_configuration,
        source_file=source_file,
        sample=sample, **run_attributes)
def spectrum_list(self, count, data_processing_method=None):
    """Create the section object for the spectrum list.

    Parameters
    ----------
    count : int
        The value for the section's ``count`` attribute.
    data_processing_method : str, optional
        Key of the data processing method to reference. When omitted, the first
        key registered in the document context is used; a warning is issued
        if none is registered.

    Returns
    -------
    SpectrumListSection
    """
    self.state_machine.transition('spectrum_list')
    if data_processing_method is None:
        known_methods = list(self.context['DataProcessing'].keys())
        if known_methods:
            data_processing_method = known_methods[0]
        else:
            warnings.warn(
                "No Data Processing method found. mzML file may not be fully standard-compliant",
                stacklevel=2)
    return SpectrumListSection(
        self.writer, self.context, count=count,
        data_processing_method=data_processing_method)
def chromatogram_list(self, count, data_processing_method=None):
    """Create the section object for the chromatogram list.

    Parameters
    ----------
    count : int
        The value for the section's ``count`` attribute.
    data_processing_method : str, optional
        Key of the data processing method to reference. When omitted, the first
        key registered in the document context is used; a warning is issued
        if none is registered.

    Returns
    -------
    ChromatogramListSection
    """
    self.state_machine.transition('chromatogram_list')
    if data_processing_method is None:
        known_methods = list(self.context['DataProcessing'].keys())
        if known_methods:
            data_processing_method = known_methods[0]
        else:
            warnings.warn(
                "No Data Processing method found. mzML file may not be fully standard-compliant",
                stacklevel=2)
    return ChromatogramListSection(
        self.writer, self.context, count=count,
        data_processing_method=data_processing_method)
def spectrum(self, mz_array: Optional[np.ndarray] = None, intensity_array: Optional[np.ndarray] = None,
             charge_array: Optional[np.ndarray] = None, id: Optional[str] = None,
             polarity='positive scan', centroided=True, precursor_information=None,
             scan_start_time=None, params=None, compression=COMPRESSION_ZLIB,
             encoding=None, other_arrays=None, scan_params=None, scan_window_list=None,
             instrument_configuration_id=None, intensity_unit=DEFAULT_INTENSITY_UNIT) -> Spectrum:
    '''Create a new :class:`~.Spectrum` instance to be written.

    This method does not immediately write and close the spectrum element, leaving it
    open for modification and embedding. The writer must be in the ``spectrum_list``
    state, and :attr:`spectrum_count` is incremented by every call.

    Parameters
    ----------
    mz_array: :class:`np.ndarray` of floats
        The m/z array of the spectrum. Its length is used as the default array length.
    intensity_array: :class:`np.ndarray` of floats
        The intensity array of the spectrum
    charge_array: :class:`np.ndarray`, optional
        The charge state array of the spectrum, optional.
    id: str
        The native ID of the spectrum.
    polarity: str or int, optional
        The polarity of the spectrum. If an integer, the sign of
        the integer is used (zero means no polarity), otherwise the value is
        searched for "positive" or "negative".
    centroided: bool, optional
        Whether the spectrum is continuous or discretized by peak picking.
        Defaults to :const:`True`.
    precursor_information: dict or :class:`PrecursorBuilder`, optional
        The precursor ion description. Will be passed to :meth:`_prepare_precursor_list`.
        The structure of this object should either be formatted as arguments to
        :meth:`precursor_builder`, or a :class:`PrecursorBuilder` instance populated
        with information.
    scan_start_time: float, optional
        The scan start time. A number is written in minutes; any other value is
        added to the scan parameters as given.
    params: list, optional
        The parameters of the `spectrum`
    compression: str, optional
        The compression type name to use. Defaults to `COMPRESSION_ZLIB`.
    encoding: dict, optional
        A mapping from array name to NumPy data types. Arrays missing from the
        mapping use ``np.float32``. A value that is not a mapping is used as the
        encoding of every array. Defaults to 64-bit floats for m/z and 32-bit
        integers for charges.
    other_arrays: list, optional
        An iterable of ``(array type, array)`` pairs of additional data arrays. Array
        types may either be strings, :class:`Mapping` objects that define
        :class:`~.CVParam` or :class:`~.UserParam`, or such parameter objects themselves.
        Use the latter two methods when defining arrays with units.
    scan_params: list, optional
        A list of cvParams for the `scan` of this `spectrum`
    scan_window_list: list, optional
        A list of scan windows specified as pairs of m/z intervals
    instrument_configuration_id: str, optional
        The `id` of the `instrumentConfiguration` to associate with this spectrum
        if not the default one.
    intensity_unit: str, optional
        The unit of the intensity array, also passed on when preparing the
        precursor list. Defaults to `DEFAULT_INTENSITY_UNIT`.

    Returns
    -------
    :class:`~.Spectrum`

    Raises
    ------
    ValueError
        If an entry of ``other_arrays`` has :const:`None` as its array type.

    See Also
    --------
    :meth:`write_spectrum`
    :meth:`chromatogram`
    :meth:`write_chromatogram`
    '''
    self.state_machine.expects_state("spectrum_list")
    if encoding is None:
        encoding = {MZ_ARRAY: np.float64, CHARGE_ARRAY: np.int32}
    # Copy the list arguments so the caller's lists are never appended to.
    params = [] if params is None else list(params)
    scan_params = [] if scan_params is None else list(scan_params)
    if other_arrays is None:
        other_arrays = []
    scan_window_list = [] if scan_window_list is None else list(scan_window_list)

    # Use collections.defaultdict, as ``chromatogram`` does, instead of
    # instantiating the typing alias.
    if isinstance(encoding, Mapping):
        encoding = defaultdict(lambda: np.float32, encoding)
    else:
        # create new variable to capture in closure
        _encoding = encoding
        encoding = defaultdict(lambda: _encoding)

    if polarity is not None:
        if isinstance(polarity, int):
            if polarity > 0:
                polarity = 'positive scan'
            elif polarity < 0:
                polarity = 'negative scan'
            else:
                polarity = None
        elif 'positive' in polarity:
            polarity = 'positive scan'
        elif 'negative' in polarity:
            polarity = 'negative scan'
        else:
            polarity = None
        if polarity not in params and polarity is not None:
            params.append(polarity)

    if centroided:
        peak_mode = "centroid spectrum"
    else:
        peak_mode = 'profile spectrum'
    params.append(peak_mode)

    array_list = []
    default_array_length = len(mz_array) if mz_array is not None else 0
    if mz_array is not None:
        mz_array_tag = self._prepare_array(
            mz_array, encoding=encoding[MZ_ARRAY], compression=compression,
            array_type=MZ_ARRAY, scope='spectrum')
        array_list.append(mz_array_tag)

    if intensity_array is not None:
        intensity_array_tag = self._prepare_array(
            intensity_array, encoding=encoding[INTENSITY_ARRAY], compression=compression,
            array_type={"name": INTENSITY_ARRAY, "unit_name": intensity_unit}, scope='spectrum')
        array_list.append(intensity_array_tag)

    if charge_array is not None:
        charge_array_tag = self._prepare_array(
            charge_array, encoding=encoding[CHARGE_ARRAY], compression=compression,
            array_type=CHARGE_ARRAY, scope='spectrum')
        array_list.append(charge_array_tag)

    for array_type, array in other_arrays:
        if array_type is None:
            raise ValueError("array type can't be None")
        if isinstance(array_type, Mapping):
            array_name = array_type['name']
        else:
            array_name = array_type
        # Only the extra arrays are checked against the default length, so
        # only they can end up with an explicit array length.
        array_tag = self._prepare_array(
            array, encoding=encoding[array_name], compression=compression, array_type=array_type,
            default_array_length=default_array_length, scope='spectrum')
        array_list.append(array_tag)
    array_list_tag = self.BinaryDataArrayList(array_list)

    if precursor_information is not None:
        precursor_list = self._prepare_precursor_list(
            precursor_information, intensity_unit=intensity_unit)
    else:
        precursor_list = None

    if scan_start_time is not None:
        if isinstance(scan_start_time, numbers.Number):
            scan_params.append({"name": "scan start time",
                                "value": scan_start_time,
                                "unitName": DEFAULT_TIME_UNIT})
        else:
            scan_params.append(scan_start_time)

    # The spec says the instrument configuration reference is optional, but
    # the validator calls it a must, so it is always passed through.
    scan = self.Scan(scan_window_list=scan_window_list, params=scan_params,
                     instrument_configuration_ref=instrument_configuration_id)
    scan_list = self.ScanList([scan], params=["no combination"])

    index = self.spectrum_count
    self.spectrum_count += 1
    spectrum = self.Spectrum(
        index, array_list_tag, scan_list=scan_list, params=params, id=id,
        default_array_length=default_array_length,
        precursor_list=precursor_list)
    return spectrum
def write_spectrum(self, mz_array=None, intensity_array=None, charge_array=None, id=None,
                   polarity='positive scan', centroided=True, precursor_information=None,
                   scan_start_time=None, params=None, compression=COMPRESSION_ZLIB,
                   encoding=None, other_arrays=None, scan_params=None, scan_window_list=None,
                   instrument_configuration_id=None, intensity_unit=DEFAULT_INTENSITY_UNIT):
    '''Build a :class:`~.Spectrum` from the provided data and write it at once.

    Every argument is handed unchanged to :meth:`spectrum`; use that method
    directly to obtain the spectrum without writing it.

    Parameters
    ----------
    mz_array: :class:`np.ndarray` of floats
        The m/z array of the spectrum
    intensity_array: :class:`np.ndarray` of floats
        The intensity array of the spectrum
    charge_array: :class:`np.ndarray`, optional
        The charge state array of the spectrum, optional.
    id: str
        The native ID of the spectrum.
    polarity: str or int, optional
        The polarity of the spectrum. If an integer, the sign of
        the integer is used, otherwise it is interpreted as a cvParam
    centroided: bool, optional
        Whether the spectrum is continuous or discretized by peak picking.
        Defaults to :const:`True`.
    precursor_information: dict or :class:`PrecursorBuilder`, optional
        The precursor ion description. Will be passed to :meth:`_prepare_precursor_list`.
        The structure of this object should either be formatted as arguments to
        :meth:`precursor_builder`, or a :class:`PrecursorBuilder` instance populated
        with information.
    scan_start_time: float, optional
        The scan start time, in minutes
    params: list, optional
        The parameters of the `spectrum`
    compression: str, optional
        The compression type name to use. Defaults to `COMPRESSION_ZLIB`.
    encoding: dict, optional
        A mapping from array name to NumPy data types.
    other_arrays: list, optional
        An iterable of array names to additional data arrays. Array names may either be
        strings, :class:`Mapping` objects that define :class:`~.CVParam` or :class:`~.UserParam`,
        or such parameter objects themselves. Use the latter two methods when defining arrays with
        units.
    scan_params: list, optional
        A list of cvParams for the `scan` of this `spectrum`
    scan_window_list: list, optional
        A list of scan windows specified as pairs of m/z intervals
    instrument_configuration_id: str, optional
        The `id` of the `instrumentConfiguration` to associate with this spectrum
        if not the default one.
    intensity_unit: str, optional
        The unit of the intensity array. Defaults to `DEFAULT_INTENSITY_UNIT`.

    See Also
    --------
    :meth:`spectrum`
    '''
    spectrum_arguments = dict(
        mz_array=mz_array,
        intensity_array=intensity_array,
        charge_array=charge_array,
        id=id,
        polarity=polarity,
        centroided=centroided,
        precursor_information=precursor_information,
        scan_start_time=scan_start_time,
        params=params,
        compression=compression,
        encoding=encoding,
        other_arrays=other_arrays,
        scan_params=scan_params,
        scan_window_list=scan_window_list,
        instrument_configuration_id=instrument_configuration_id,
        intensity_unit=intensity_unit,
    )
    pending = self.spectrum(**spectrum_arguments)
    pending.write(self.writer)
def chromatogram(self, time_array, intensity_array, id=None,
                 chromatogram_type="selected ion current",
                 precursor_information=None, params=None,
                 compression=COMPRESSION_ZLIB, encoding=32, other_arrays=None,
                 intensity_unit=DEFAULT_INTENSITY_UNIT, time_unit=DEFAULT_TIME_UNIT):
    """Create a new chromatogram component to be written.

    The chromatogram is returned, not written. The writer must be in the
    ``chromatogram_list`` state, and :attr:`chromatogram_count` is incremented
    by every call.

    Parameters
    ----------
    time_array : sequence, optional
        The time array. Its length is used as the default array length.
    intensity_array : sequence, optional
        The intensity array.
    id : str, optional
        The id of the chromatogram.
    chromatogram_type : str, optional
        Appended to ``params`` as the last parameter.
    precursor_information : object, optional
        Passed to :meth:`_prepare_precursor_list`; the first precursor of the
        result is used.
    params : list, optional
        The parameters of the chromatogram. The list is copied.
    compression : str, optional
        The compression type name to use. Defaults to `COMPRESSION_ZLIB`.
    encoding : Mapping or object, optional
        Either a mapping from array name to encoding, where missing names use
        ``np.float32``, or a single encoding used for every array.
    other_arrays : list, optional
        An iterable of ``(array type, array)`` pairs of additional data arrays.
    intensity_unit : str, optional
        The unit of the intensity array.
    time_unit : str, optional
        The unit of the time array.

    Returns
    -------
    The object built by ``self.Chromatogram``.
    """
    self.state_machine.expects_state("chromatogram_list")
    params = [] if params is None else list(params)

    if isinstance(encoding, Mapping):
        encoding = defaultdict(lambda: np.float32, encoding)
    else:
        # Bind the single encoding to its own name for the closure below.
        uniform_encoding = encoding
        encoding = defaultdict(lambda: uniform_encoding)

    if other_arrays is None:
        other_arrays = []

    precursor = None
    if precursor_information is not None:
        precursor = self._prepare_precursor_list(
            precursor_information, intensity_unit=intensity_unit)[0]

    default_array_length = 0 if time_array is None else len(time_array)
    array_list = []
    if time_array is not None:
        array_list.append(self._prepare_array(
            time_array, encoding=encoding[TIME_ARRAY], compression=compression,
            array_type={"name": TIME_ARRAY, "unit_name": time_unit}, scope='chromatogram'))
    if intensity_array is not None:
        array_list.append(self._prepare_array(
            intensity_array, encoding=encoding[INTENSITY_ARRAY], compression=compression,
            array_type={"name": INTENSITY_ARRAY, "unit_name": intensity_unit}, scope='chromatogram'))

    for array_type, array in other_arrays:
        array_name = array_type['name'] if isinstance(array_type, Mapping) else array_type
        array_list.append(self._prepare_array(
            array, encoding=encoding[array_name], compression=compression, array_type=array_type,
            default_array_length=default_array_length, scope='chromatogram'))

    params.append(chromatogram_type)
    array_list_tag = self.BinaryDataArrayList(array_list)

    index = self.chromatogram_count
    self.chromatogram_count = index + 1
    return self.Chromatogram(
        index=index, binary_data_list=array_list_tag,
        precursor=precursor,
        default_array_length=default_array_length,
        id=id, params=params)
def write_chromatogram(self, time_array, intensity_array, id=None,
                       chromatogram_type="selected ion current",
                       precursor_information=None, params=None,
                       compression=COMPRESSION_ZLIB, encoding=32, other_arrays=None,
                       intensity_unit=DEFAULT_INTENSITY_UNIT, time_unit=DEFAULT_TIME_UNIT):
    """Build a chromatogram from the provided data and write it at once.

    Every argument is handed unchanged to :meth:`chromatogram`; use that
    method directly to obtain the chromatogram without writing it.
    """
    chromatogram_arguments = dict(
        time_array=time_array,
        intensity_array=intensity_array,
        id=id,
        chromatogram_type=chromatogram_type,
        precursor_information=precursor_information,
        params=params,
        compression=compression,
        encoding=encoding,
        other_arrays=other_arrays,
        intensity_unit=intensity_unit,
        time_unit=time_unit,
    )
    pending = self.chromatogram(**chromatogram_arguments)
    pending.write(self.writer)
def _prepare_array(self, array, encoding=32, compression=COMPRESSION_ZLIB,
                   array_type=None, default_array_length=None, scope=None):
    """Encode one data array and wrap it as a binary data array component.

    Parameters
    ----------
    array : sequence
        The values to encode.
    encoding : object, optional
        Key into ``encoding_map``; numbers are converted to :class:`int` first.
    compression : str, optional
        The compression to use, also a key into ``compression_map``.
    array_type : str, Mapping or parameter object, optional
        The array type parameter. A name not listed in ``ARRAY_TYPES`` is
        additionally recorded as a non-standard data array.
    default_array_length : int, optional
        When given and different from the array's length, the array's own
        length is written explicitly.
    scope : str, optional
        Accepted but not used by this method.

    Returns
    -------
    The object built by ``self.BinaryDataArray``.
    """
    encoding_key = int(encoding) if isinstance(encoding, numbers.Number) else encoding
    dtype = encoding_map[encoding_key]
    if dtype is None:
        # No fixed type for this encoding: infer one from the data itself.
        if len(array):
            dtype = encoding_map[array[0].__class__]
        else:
            try:
                dtype = array.dtype
            except AttributeError:
                dtype = np.float32

    array = np.array(array, dtype=dtype)
    encoded_binary = encode_array(
        array, compression=compression, dtype=dtype)
    binary = self.Binary(encoded_binary)

    override_length = (
        default_array_length is not None and len(array) != default_array_length)

    params = []
    if array_type is not None:
        params.append(array_type)
        type_name = array_type['name'] if isinstance(array_type, Mapping) else array_type
        if type_name not in ARRAY_TYPES:
            params.append({"name": NON_STANDARD_ARRAY, "value": type_name})
    params.append(compression_map[compression])
    params.append(dtype_to_encoding[dtype])

    return self.BinaryDataArray(
        binary, len(encoded_binary),
        array_length=(len(array) if override_length else None),
        params=params)
def _prepare_precursor_list(self, precursors, intensity_unit=DEFAULT_INTENSITY_UNIT):
    """Convert any supported precursor description into a precursor list.

    Parameters
    ----------
    precursors : object
        One of: an existing precursor list, which is returned unchanged; a
        :class:`dict` of arguments for :meth:`prepare_precursor_information`;
        a :class:`PrecursorBuilder`; or an iterable mixing precursor
        components, dicts and builders.
    intensity_unit : str, optional
        Passed to :meth:`prepare_precursor_information`.

    Returns
    -------
    The precursor list component.
    """
    if isinstance(precursors, self.PrecursorList.type):
        return precursors
    elif isinstance(precursors, (dict)):
        precursors = self.PrecursorList([self.prepare_precursor_information(
            intensity_unit=intensity_unit, **precursors)])
    elif isinstance(precursors, PrecursorBuilder):
        precursors = self.PrecursorList([self.prepare_precursor_information(
            precursors,
            intensity_unit=intensity_unit)])
    else:
        packaged = []
        # Items of any other type are skipped.
        for p in ensure_iterable(precursors):
            if isinstance(p, self.Precursor.type):
                packaged.append(p)
            elif isinstance(p, dict):
                packaged.append(
                    self.prepare_precursor_information(
                        intensity_unit=intensity_unit, **p))
            elif isinstance(p, PrecursorBuilder):
                # Use the same public method as the other branches; the
                # underscore-prefixed name called here before is not the one
                # this class defines.
                packaged.append(
                    self.prepare_precursor_information(
                        p, intensity_unit=intensity_unit))
        precursors = self.PrecursorList(packaged)
    return precursors
def prepare_precursor_information(self, mz=None, intensity=None, charge=None, spectrum_reference=None, activation=None,
                                  isolation_window_args=None, params=None,
                                  intensity_unit=DEFAULT_INTENSITY_UNIT, scan_id=None, external_spectrum_id=None,
                                  source_file_reference=None, **kwargs):
    '''Prepare a :class:`Precursor` element from disparate data structures.

    Parameters
    ----------
    mz: float, optional
        The m/z of the first selected ion. May also be passed as
        `selected_ion_mz`. If a :class:`PrecursorBuilder` or :class:`dict`
        is passed here instead, it is unpacked directly into
        :class:`Precursor` and every other argument is ignored.
    intensity: float, optional
        The intensity of the first selected ion
    charge: int, optional
        The charge state of the first selected ion
    spectrum_reference: str, optional
        The `id` of the precursor `<spectrum>` for this precursor
    activation: list, optional
        A list of parameters describing the ion activation method used.
    isolation_window_args: tuple, list, or dict, optional
        Arguments for the isolation window element. Tuple or list values
        are passed positionally, mappings as keyword arguments.
        This argument may also be passed as `isolation_window`.
    params: list, optional
        The cvParams of the first selected ion
    intensity_unit: str
        The intensity unit of the first selected ion
    scan_id: str, optional
        An alias for `spectrum_reference`; takes precedence when given
    external_spectrum_id: str, optional
        The `externalSpectrumID` attribute of the precursor
    source_file_reference: str, optional
        The `sourceFileRef` attribute of the precursor

    Returns
    -------
    :class:`~.Precursor`
    '''
    # Pre-assembled descriptions bypass all of the argument handling below.
    if isinstance(mz, PrecursorBuilder):
        return self.Precursor(**mz.pack())
    if isinstance(mz, dict):
        return self.Precursor(**mz)

    # Accept the alternative spellings of the arguments.
    if isolation_window_args is None:
        isolation_window_args = kwargs.get("isolation_window")
    if mz is None:
        mz = kwargs.get("selected_ion_mz")
    if scan_id is not None:
        spectrum_reference = scan_id
    if params is None:
        params = []

    if activation:
        activation = self.Activation(activation)

    if any((mz, intensity, charge)):
        # Forward the intensity unit so a non-default unit is not dropped.
        ion = self.SelectedIon(
            mz, intensity, charge, params=params,
            intensity_unit=intensity_unit)
        ion_list = self.SelectedIonList([ion])
    else:
        ion_list = None

    if isolation_window_args:
        if isinstance(isolation_window_args, (list, tuple)):
            isolation_window_tag = self.IsolationWindow(*isolation_window_args)
        else:
            isolation_window_tag = self.IsolationWindow(**isolation_window_args)
    else:
        isolation_window_tag = None

    # The external and source file references are documented arguments, so
    # they are passed through rather than discarded.
    return self.Precursor(
        ion_list,
        activation=activation,
        isolation_window=isolation_window_tag,
        spectrum_reference=spectrum_reference,
        source_file_reference=source_file_reference,
        external_spectrum_id=external_spectrum_id)
def precursor_builder(self, mz=None, intensity=None, charge=None, spectrum_reference=None, activation=None,
                      isolation_window_args=None, params=None,
                      intensity_unit=DEFAULT_INTENSITY_UNIT, scan_id=None,
                      external_spectrum_id=None, source_file_reference=None,
                      isolation_window=None):
    '''Create a :class:`PrecursorBuilder`, an object to help populate the precursor information
    data structure.

    The helper object should be used to incrementally populate the precursor information passed
    to :meth:`spectrum` or :meth:`write_spectrum`'s `precursor_information` argument.

    Parameters
    ----------
    mz: float, optional
        The m/z of the first selected ion
    intensity: float, optional
        The intensity of the first selected ion
    charge: int, optional
        The charge state of the first selected ion
    spectrum_reference: str, optional
        The `id` of the precursor `<spectrum>` for this precursor, mapped through the
        document context.
    activation: dict or list, optional
        Parameters forwarded to :meth:`PrecursorBuilder.activation`. This should be a dictionary
        with a key "params" and a list of :class:`~.CVParam` coerce-able values, with additional
        optional keys naming other :class:`~.CVParam` coerce-able values. If a :class:`list` is
        passed, it will be wrapped in one e.g. ``{"params": activation}``
    isolation_window_args: tuple, list, or dict, optional
        Parameters forwarded to :meth:`PrecursorBuilder.isolation_window`,
        tuple or list of three values are converted into :class:`dict` of the correct
        structure. The expected keys are "lower", the lower m/z offset, "target", the center m/z,
        and "upper", the upper m/z offset. You may also pass this argument as `isolation_window`.
    params: list, optional
        The cv- and user-params of the first selected ion, in addition to `mz`, `intensity`,
        `charge`.
    intensity_unit: str
        The intensity unit of the first selected ion, to be specified with `intensity`
    scan_id: str, optional
        An alias for `spectrum_reference`; takes precedence when given
    external_spectrum_id: str, optional
        The `externalSpectrumID` attribute of the precursor
    source_file_reference: str, optional
        The `sourceFileRef` attribute of the precursor

    Returns
    -------
    :class:`PrecursorBuilder`
    '''
    if isolation_window_args is None:
        isolation_window_args = isolation_window
    # Only override the reference when the alias was actually supplied;
    # the reversed test erased `spectrum_reference` on every normal call.
    if scan_id is not None:
        spectrum_reference = scan_id

    inst = PrecursorBuilder(
        self, spectrum_reference=spectrum_reference,
        source_file_reference=source_file_reference,
        external_spectrum_id=external_spectrum_id)

    if mz is not None or intensity is not None or charge is not None or params is not None:
        inst.selected_ion(
            selected_ion_mz=mz, intensity=intensity, charge=charge,
            intensity_unit=intensity_unit, params=params)

    # Configure the window only when one was given; the reversed test made
    # the tuple conversion unreachable and discarded real windows.
    if isolation_window_args is not None:
        if isinstance(isolation_window_args, (tuple, list)):
            isolation_window_args = {
                "lower": isolation_window_args[0],
                "target": isolation_window_args[1],
                "upper": isolation_window_args[2]}
        inst.isolation_window(isolation_window_args)

    if activation is not None:
        if isinstance(activation, (list, tuple)):
            activation = {'params': activation}
        inst.activation(activation)
    return inst
class SelectedIonBuilder(ElementBuilder):
    '''Builder holding the fields describing a single selected ion.

    Every field is declared through :class:`ParamManagingProperty`. The
    ``mz`` field is stored under the name ``selected_ion_mz`` and also
    registers ``mz`` as an alias.
    '''
    mz = ParamManagingProperty("selected_ion_mz", 0.0, aliases=["mz"])
    charge = ParamManagingProperty("charge")
    intensity = ParamManagingProperty("intensity", 0.0)
    intensity_unit = ParamManagingProperty("intensity_unit", DEFAULT_INTENSITY_UNIT)
class IsolationWindowBuilder(ElementBuilder):
    '''Builder holding the ``lower``, ``target`` and ``upper`` fields of an
    isolation window, each declared through :class:`ParamManagingProperty`.
    '''
    lower = ParamManagingProperty("lower")
    target = ParamManagingProperty("target")
    upper = ParamManagingProperty("upper")
class ActivationBuilder(ElementBuilder):
    '''Builder for the activation description of a precursor.

    Declares no fields of its own; all behavior is inherited from
    :class:`ElementBuilder`.
    '''
class PrecursorBuilder(ElementBuilder):
    '''Incrementally assemble the description of one precursor.

    The builder keeps a list of selected ion builders, at most one isolation
    window builder and one activation builder, plus the reference attributes
    of the precursor itself.
    '''

    def __init__(self, source, binding=None, params=None, **kwargs):
        '''Forward all arguments unchanged to :class:`ElementBuilder`.'''
        super(PrecursorBuilder, self).__init__(
            source, binding, params, **kwargs)

    selected_ion_list = ParamManagingProperty("selected_ion_list", list)
    _isolation_window = ParamManagingProperty("isolation_window")
    _activation = ParamManagingProperty('activation')

    # `scan_id` is documented by the writer as an alias for
    # `spectrum_reference`, so the alias belongs on this property and not on
    # `external_spectrum_id`.
    # NOTE(review): assumes `aliases` names alternative keys for the same
    # property - confirm against ParamManagingProperty.
    spectrum_reference = ParamManagingProperty('spectrum_reference', aliases=['scan_id'])
    source_file_reference = ParamManagingProperty('source_file_reference')
    external_spectrum_id = ParamManagingProperty('external_spectrum_id')

    def selected_ion(self, binding=None, **kwargs):
        '''Create a :class:`SelectedIonBuilder`, append it to
        :attr:`selected_ion_list` and return it.
        '''
        sib = SelectedIonBuilder(self.source, binding=binding, **kwargs)
        self.selected_ion_list.append(sib)
        return sib

    def isolation_window(self, binding=None, **kwargs):
        '''Replace the isolation window with a new
        :class:`IsolationWindowBuilder` and return the stored value.
        '''
        self._isolation_window = IsolationWindowBuilder(
            self.source, binding=binding, **kwargs)
        return self._isolation_window

    def activation(self, binding=None, **kwargs):
        '''Replace the activation with a new :class:`ActivationBuilder` and
        return the stored value.
        '''
        self._activation = ActivationBuilder(
            self.source, binding=binding, **kwargs)
        return self._activation
class IndexedMzMLWriter(PlainMzMLWriter):
    """A high level API for generating indexed mzML XML files from simple Python objects.

    This class depends heavily on :mod:`lxml`'s incremental file writing API which in turn
    depends heavily on context managers. Almost all logic is handled inside a context
    manager and in the context of a particular document. Since all operations assume
    that they have access to a universal identity map for each element in the document,
    that map is centralized in this class.

    `MzMLWriter` inherits from :class:`.ComponentDispatcher`, giving it a :attr:`context`
    attribute and access to all `Component` objects pre-bound to that context with attribute-access
    notation.

    Attributes
    ----------
    chromatogram_count : int
        A count of the number of chromatograms written
    spectrum_count : int
        A count of the number of spectra written
    index_builder : :class:`~.IndexingStream`
        A writing stream that automatically tokenizes and records byte offsets for
        specific XML tags.
    """

    def __init__(self, outfile, close=None, vocabularies=None, missing_reference_is_error=False,
                 vocabulary_resolver=None, id=None, accession=None, **kwargs):
        """Wrap `outfile` in an :class:`~.IndexingStream` and initialize the
        parent writer on top of it.

        All arguments other than `outfile` are forwarded unchanged to the
        parent class. The wrapping stream is kept as :attr:`index_builder`.
        """
        outfile = IndexingStream(outfile)
        super(IndexedMzMLWriter, self).__init__(
            outfile, close, vocabularies, missing_reference_is_error, vocabulary_resolver,
            id, accession, **kwargs)
        self.index_builder = outfile

    def toplevel_tag(self):
        """Return the top-level ``IndexedmzMLSection`` for this document,
        bound to this writer's context, `id`, `accession` and
        :attr:`index_builder`.
        """
        return IndexedmzMLSection(
            self.writer, self.context, id=self.id, accession=self.accession,
            indexer=self.index_builder)
# Default public alias: `MzMLWriter` is the indexed writer, because many
# tools rely on the index (see the module docstring).
MzMLWriter = IndexedMzMLWriter
# Alternative kept for reference (would export the un-indexed writer instead):
# MzMLWriter = PlainMzMLWriter