Source code for txpipe.ingest.mocks

from ..base_stage import PipelineStage
from ..data_types import ShearCatalog, HDFFile, TextFile, QPPDFFile
from ..shear_calibration import band_variants, metacal_variants, metadetect_variants
from ceci.config import StageParameter
import numpy as np



[docs]
class TXCosmoDC2Mock(PipelineStage):
    """
    Simulate mock shear and photometry measurements from CosmoDC2 (or similar)

    This stage simulates metacal data and metacalibrated
    photometry measurements, starting from a cosmology catalogs
    of the kind used as an input to DC2 image and obs-catalog simulations.

    This is mainly useful for testing infrastructure in advance
    of the DC2 catalogs being available, but might also be handy
    for starting from a purer simulation.
    """

    name = "TXCosmoDC2Mock"
    parallel = False
    inputs = [("response_model", HDFFile)]

    outputs = [
        ("shear_catalog", ShearCatalog),
        ("photometry_catalog", HDFFile),
    ]

    config_options = {
        "cat_name": StageParameter(str, "cosmoDC2", msg="Name of the mock catalog to use."),
        "visits_per_band": StageParameter(int, 165, msg="Number of visits per band for noise simulation."),
        "snr_limit": StageParameter(float, 4.0, msg="S/N limit for object selection."),
        "max_size": StageParameter(int, 99999999999999, msg="Maximum catalog size for testing."),
        "extra_cols": StageParameter(str, "", msg="Extra columns to include (space-separated)."),
        "max_npix": StageParameter(int, 99999999999999, msg="Maximum number of pixels."),
        "unit_response": StageParameter(bool, False, msg="Whether to use unit response in simulation."),
        "cat_size": StageParameter(int, 0, msg="Pre-computed catalog size, if known."),
        "flip_g2": StageParameter(
            bool, True, msg="Whether to flip g2 sign to match conventions. TreeCorr and NaMaster use flip_g2=True"
        ),
        "apply_mag_cut": StageParameter(bool, False, msg="Apply magnitude cut for descqa comparison."),
        "Mag_r_limit": StageParameter(float, -19, msg="Magnitude r limit for object selection."),
        "Mag_i_limit": StageParameter(float, -19, msg="Magnitude i limit for object selection."),
        "metadetect": StageParameter(
            bool, True, msg="Whether to make a metadetect-style catalog (True) or metacal (False)."
        ),
        "add_shape_noise": StageParameter(bool, True, msg="Whether to add shape noise to simulation."),
        "healpixels": StageParameter(list, [-1], msg="List of HEALPix pixels to use."),
    }

    def data_iterator(self, gc):
        # Columns we need from the cosmo simulation
        cols = [
            "mag_true_u_lsst",
            "mag_true_g_lsst",
            "mag_true_r_lsst",
            "mag_true_i_lsst",
            "mag_true_z_lsst",
            "mag_true_y_lsst",
            "ra",
            "dec",
            "ellipticity_1_true",
            "ellipticity_2_true",
            "shear_1",
            "shear_2",
            "size_true",
            "galaxy_id",
            "redshift_true",
            "convergence",
        ]
        # Add any extra requestd columns
        cols += self.config["extra_cols"].split()

        it = gc.get_quantities(cols, return_iterator=True)
        nfile = len(gc._file_list) if hasattr(gc, "_file_list") else 0

        for i, data in enumerate(it):
            if nfile:
                j = i + 1
                print(f"Loading chunk {j}/{nfile}")
            yield data

    def load_catalog(self):
        import GCRCatalogs

        cat_name = self.config["cat_name"]
        self.bands = ("u", "g", "r", "i", "z", "y")

        print(f"Loading from catalog {cat_name}")

        kw = {}
        if self.config["healpixels"] != [-1]:
            kw = {"config_overwrite": {"healpix_pixels": self.config["healpixels"]}}

        gc = GCRCatalogs.load_catalog(cat_name, **kw)

        # GCR sometimes tries to read the entire catalog
        # to measure its length rather than looking at metadata
        # this can take a very long time.
        # allow the user to say that already know it.
        N = self.config["cat_size"]
        if N == 0:
            N = len(gc)

        return gc, N

    def run(self):
        gc, N = self.load_catalog()

        print(f"Rank {self.rank} loaded: length = {N}.")

        target_size = min(N, self.config["max_size"])
        select_fraction = target_size / N

        if target_size != N:
            print(f"Will select approx {100 * select_fraction:.2f}% of objects ({target_size})")

        # Prepare output files
        metacal_file = self.open_output("shear_catalog", parallel=self.is_mpi())
        photo_file = self.open_output("photometry_catalog", parallel=self.is_mpi())
        photo_cols = self.setup_photometry_output(photo_file, target_size)
        if self.config["metadetect"]:
            metacal_cols = self.setup_metadetect_output(metacal_file, target_size)
        else:
            metacal_cols = self.setup_metacal_output(metacal_file, target_size)

        # Load the metacal response file
        self.load_metacal_response_model()

        # Keep track of catalog position
        start = 0
        count = 0

        # Loop through chunks of
        for data in self.data_iterator(gc):
            # The initial chunk size, of all the input data.
            # This will be reduced later as we remove objects
            some_col = list(data.keys())[0]
            chunk_size = len(data[some_col])
            print(f"Process {self.rank} read chunk {count} - {count + chunk_size} of {N}")
            count += chunk_size
            # Select a random fraction of the catalog if we are cutting down
            # We can't just take the earliest galaxies because they are ordered
            # by redshift
            if target_size != N:
                select = np.random.uniform(size=chunk_size) < select_fraction
                nselect = select.sum()
                print(f"Cutting down to {nselect}/{chunk_size} objects")
                for name in list(data.keys()):
                    data[name] = data[name][select]
            if self.config["apply_mag_cut"]:
                self.apply_magnitude_cut(data)

            # Simulate the various output data sets
            mock_photometry = self.make_mock_photometry(data)

            # Cut out any objects too faint to be detected and measured.
            # We have to do this after the photometry, so that we know if
            # the object is detected, but we can do it before making the mock
            # metacal info, saving us some time simulating un-needed objects
            if self.config["snr_limit"] > 0:  # otherwise there is no need to run this function which is slow
                self.remove_undetected(data, mock_photometry)
            if self.config["metadetect"]:
                mock_shear = self.make_mock_metadetect(data, mock_photometry)
            else:
                mock_shear = self.make_mock_metacal(data, mock_photometry)

            # The chunk size has now changed
            some_col = list(mock_photometry.keys())[0]
            chunk_size = len(mock_photometry[some_col])

            # start is where this process should start writing this
            # chunk of data.  end is where the final process will finish
            # writing, becoming the starting point for the whole next
            # chunk over all the processes
            start, end = self.next_output_indices(start, chunk_size)

            # Save all output
            self.write_output(
                start,
                target_size,
                photo_cols,
                metacal_cols,
                photo_file,
                mock_photometry,
                metacal_file,
                mock_shear,
            )

            # The next iteration starts writing where the current one ends.
            start = end

            if start >= target_size:
                break
        # Tidy up

        photo_file.close()
        metacal_file.close()

        self.truncate_outputs(end)

    def truncate_outputs(self, n):
        import h5py

        if self.comm is not None:
            self.comm.Barrier()

        def visitor(name, node):
            if isinstance(node, h5py.Dataset):
                print(f"Resizing {name}")
                node.resize((n,))

        if self.rank == 0:
            # all files should now be closed for all procs
            print(f"Resizing all outupts to size {n}")

            with h5py.File(self.get_output("photometry_catalog"), "r+") as f:
                f["photometry"].visititems(visitor)

            with h5py.File(self.get_output("shear_catalog"), "r+") as f:
                f["shear"].visititems(visitor)

    def next_output_indices(self, start, chunk_size):
        if self.comm is None:
            end = start + chunk_size
        else:
            all_indices = self.comm.allgather(chunk_size)
            starting_points = np.concatenate(([0], np.cumsum(all_indices)))
            # use the old start to find the end point.
            # the final starting point (not used below, since it is larger
            # than the largest self.rank value) is the total data length
            end = start + starting_points[-1]
            start = start + starting_points[self.rank]
        print(f"- Rank {self.rank} writing output to {start}-{start + chunk_size}")
        return start, end

    def setup_photometry_output(self, photo_file, target_size):
        # Get a list of all the column names
        cols = ["ra", "dec", "extendedness"]
        for band in self.bands:
            cols.append(f"mag_{band}")
            cols.append(f"mag_{band}_err")
            cols.append(f"snr_{band}")

        for col in self.config["extra_cols"].split():
            if col not in cols:
                cols.append(col)
        else:
            print(f"Warning: extra col '{col}' already present; skipping duplicate")

        # Make group for all the photometry
        group = photo_file.create_group("photometry")
        group.attrs["bands"] = self.bands

        # Extensible columns becase we don't know the size yet.
        # We will cut down the size at the end.
        for col in cols:
            group.create_dataset(col, (target_size,), maxshape=(target_size,), dtype="f8")

        # The only non-float column for now
        group.create_dataset("id", (target_size,), maxshape=(target_size,), dtype="i8")

        return cols + ["id"]

    def setup_metadetect_output(self, metacal_file, target_size):
        # Get a list of all the column names
        cols = metadetect_variants(
            "g1",
            "g2",
            "T",
            "s2n",
            "T_err",
            "ra",
            "dec",
            "psf_g1",
            "psf_g2",
            "psf_T_mean",
            "weight",
        ) + band_variants("riz", "mag", "mag_err", shear_catalog_type="metadetect")

        # Store the truth shear only for the primary catalog, true redshift for all variants
        cols += ["00/true_g1", "00/true_g2"] + metadetect_variants("redshift_true")

        # Make group for all the photometry
        group = metacal_file.create_group("shear")
        group.attrs["bands"] = self.bands

        # Extensible columns becase we don't know the size yet.
        # We will cut down the size at the end.
        for col in cols:
            group.create_dataset(col, (target_size,), maxshape=(target_size,), dtype="f8")

        # Integer columns
        int_cols = metadetect_variants("id", "flags")
        for col in int_cols:
            group.create_dataset(col, (target_size,), maxshape=(target_size,), dtype="i8")

        return cols + int_cols

    def setup_metacal_output(self, metacal_file, target_size):
        # Get a list of all the column names
        cols = (
            [
                "ra",
                "dec",
                "psf_g1",
                "psf_g2",
                "psf_T_mean",
            ]
            + metacal_variants("g1", "g2", "T", "s2n", "T_err")
            + band_variants("riz", "mag", "mag_err", shear_catalog_type="metacal")
            + ["weight"]
        )

        cols += ["true_g1", "true_g2", "redshift_true"]

        # Make group for all the photometry
        group = metacal_file.create_group("shear")
        group.attrs["bands"] = self.bands

        # Extensible columns becase we don't know the size yet.
        # We will cut down the size at the end.

        for col in cols:
            group.create_dataset(col, (target_size,), maxshape=(target_size,), dtype="f8")

        group.create_dataset("id", (target_size,), maxshape=(target_size,), dtype="i8")

        for col in metacal_variants("flags"):
            group.create_dataset(col, (target_size,), maxshape=(target_size,), dtype="i8")

        return cols + ["id", "flags"]

    def load_metacal_response_model(self):
        """
        Load an HDF file containing the response model
        R(log10(snr), size)
        R_std(log10(snr), size)

        where R is the mean metacal response in a bin and
        R_std is its standard deviation.

        So far only one of these files exists!

        """
        import scipy.interpolate

        if self.config["unit_response"]:
            return

        model_file = self.open_input("response_model")
        snr_centers = model_file["R_model/log10_snr"][:]
        sz_centers = model_file["R_model/size"][:]
        R_mean = model_file["R_model/R_mean"][:]
        R_std = model_file["R_model/R_std"][:]
        model_file.close()

        # Save a 2D spline
        snr_grid, sz_grid = np.meshgrid(snr_centers, sz_centers)
        self.R_spline = scipy.interpolate.SmoothBivariateSpline(
            snr_grid.T.flatten(),
            sz_grid.T.flatten(),
            R_mean.flatten(),
            w=R_std.flatten(),
        )
        self.Rstd_spline = scipy.interpolate.SmoothBivariateSpline(
            snr_grid.T.flatten(), sz_grid.T.flatten(), R_std.flatten()
        )

    def write_output(
        self,
        start,
        target_size,
        photo_cols,
        metacal_cols,
        photo_file,
        photo_data,
        metacal_file,
        metacal_data,
    ):
        """
        Save the photometry we have just simulated to disc

        Parameters
        ----------

        photo_file: HDF File object

        metacal_file: HDF File object

        photo_data: dict
            Dictionary of simulated photometry

        metacal_data: dict
            Dictionary of simulated metacal data

        """
        # Work out the range of data to output (since we will be
        # doing this in chunks). If we have cut down to a random
        # subset of the catalog then we may have gone over the
        # target length, depending on the random selection
        n = len(photo_data["id"])
        end = min(start + n, target_size)

        # assert photo_data['id'].min()>0

        # Save each column
        for name in photo_cols:
            photo_file[f"photometry/{name}"][start:end] = photo_data[name]

        for name in metacal_cols:
            metacal_file[f"shear/{name}"][start:end] = metacal_data[name]

    def make_mock_photometry(self, data):
        # The visit count affects the overall noise levels
        n_visit = self.config["visits_per_band"]
        # Do all the work in the function below
        photo = make_mock_photometry(n_visit, self.bands, data, self.config["unit_response"])

        for col in self.config["extra_cols"].split():
            photo[col] = data[col]

        return photo

    def make_mock_metadetect(self, data, photo):
        """
        !!! This function is not working properly yet!!!
        Generate a mock metadetect table.
        This is a long and complicated function unfortunately.

        Throughout we assume a single R = R11 = R22, with R12 = R21 = 0

        Parameters
        ----------
        data: dict
            Dictionary of arrays read from the input cosmo catalog
        photo: dict
            Dictionary of arrays generated to simulate photometry
        """

        # These are the numbers from figure F1 of the DES Y1 shear catalog paper
        # (this version is not yet public but is awaiting a second referee response)

        # Overall SNR for the three bands usually used for shape measurement
        # We use the true SNR not the estimated one, though these are pretty close
        snr = (photo["snr_r"] ** 2 + photo["snr_i"] ** 2 + photo["snr_z"] ** 2) ** 0.5
        snr_1p = (photo["snr_r_1p"] ** 2 + photo["snr_i_1p"] ** 2 + photo["snr_z_1p"] ** 2) ** 0.5
        snr_1m = (photo["snr_r_1m"] ** 2 + photo["snr_i_1m"] ** 2 + photo["snr_z_1m"] ** 2) ** 0.5
        snr_2p = (photo["snr_r_2p"] ** 2 + photo["snr_i_2p"] ** 2 + photo["snr_z_2p"] ** 2) ** 0.5
        snr_2m = (photo["snr_r_2m"] ** 2 + photo["snr_i_2m"] ** 2 + photo["snr_z_2m"] ** 2) ** 0.5

        if self.config["unit_response"]:
            assert np.allclose(snr, snr_1p)
            assert np.allclose(snr, snr_2m)

        nobj = snr.size

        log10_snr = np.log10(snr)

        # Convert from the half-light radius which is in the
        # input catalogs to a sigma.  Do this by pretending
        # that it is a Gaussian.  This is clearly wrong, and
        # if this causes major errors in the size cuts we may
        # have to modify this
        size_hlr = data["size_true"]
        size_sigma = size_hlr / np.sqrt(2 * np.log(2))
        size_T = 2 * size_sigma**2

        # Use a fixed PSF across all the objects
        psf_fwhm = 0.75
        psf_sigma = psf_fwhm / (2 * np.sqrt(2 * np.log(2)))
        psf_T = 2 * psf_sigma**2

        if self.config["unit_response"]:
            R = 1.0
            R_size = 0.0

        else:
            # Use the response model to get a reasonable response
            # value for this size and SNR
            R_mean = self.R_spline(log10_snr, size_T, grid=False)
            R_std = self.Rstd_spline(log10_snr, size_T, grid=False)

            # Assume a 0.2 correlation between the size response
            # and the shear response.
            rho = 0.2
            f = np.random.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], nobj).T
            R, R_size = f * R_std + R_mean

        # Convert magnitudes to fluxes according to the baseline
        # use in the metacal numbers
        flux_r = 10**0.4 * (27 - photo["mag_r"])
        flux_i = 10**0.4 * (27 - photo["mag_i"])
        flux_z = 10**0.4 * (27 - photo["mag_z"])

        # Note that this is delta_gamma not 2*delta_gamma, because
        # of how we use it below
        delta_gamma = 0.01

        # Use a fixed shape noise per component to generate
        # an overall
        if self.config["add_shape_noise"]:
            shape_noise = 0.26
        else:
            shape_noise = 0.0

        eps = np.random.normal(0, shape_noise, nobj) + 1.0j * np.random.normal(0, shape_noise, nobj)
        # True shears without shape noise
        g1 = data["shear_1"]/ (1.0 - data["convergence"]) 
        g2 = data["shear_2"]/ (1.0 - data["convergence"])

        if self.config["flip_g2"]:
            g2 *= -1

        # Do the full combination of (g,epsilon) -> e, not the approx one
        g = g1 + 1j * g2
        e = (eps + g) / (1 + g.conj() * eps)
        e1 = e.real
        e2 = e.imag
        g1 = e1
        g2 = e2
        zero = np.zeros(nobj)
        # Now collect together everything to go into the metacal
        # file
        output = {
            # Basic values
            "00/id": photo["id"],
            "1p/id": photo["id"],
            "1m/id": photo["id"],
            "2p/id": photo["id"],
            "2m/id": photo["id"],
            "00/ra": photo["ra"],
            "1p/ra": photo["ra"],
            "1m/ra": photo["ra"],
            "2p/ra": photo["ra"],
            "2m/ra": photo["ra"],
            "00/dec": photo["dec"],
            "1p/dec": photo["dec"],
            "1m/dec": photo["dec"],
            "2p/dec": photo["dec"],
            "2m/dec": photo["dec"],
            # Keep the truth values for use in some testing paths
            # We are not pretending to do meta
            "00/true_g1": g1,
            "00/true_g2": g2,
            "00/redshift_true": photo["redshift_true"],
            "1p/redshift_true": photo["redshift_true"],
            "1m/redshift_true": photo["redshift_true"],
            "2p/redshift_true": photo["redshift_true"],
            "2m/redshift_true": photo["redshift_true"],
            # g1
            "00/g1": e1 * R,
            "1p/g1": (e1 + delta_gamma) * R,
            "1m/g1": (e1 - delta_gamma) * R,
            "2p/g1": e1 * R,
            "2m/g1": e1 * R,
            # g2
            "00/g2": e2 * R,
            "1p/g2": e2 * R,
            "1m/g2": e2 * R,
            "2p/g2": (e2 + delta_gamma) * R,
            "2m/g2": (e2 - delta_gamma) * R,
            # T
            "00/T": size_T,
            "1p/T": size_T + R_size * delta_gamma,
            "1m/T": size_T - R_size * delta_gamma,
            "2p/T": size_T + R_size * delta_gamma,
            "2m/T": size_T - R_size * delta_gamma,
            # Terr
            "00/T_err": zero,
            "1p/T_err": zero,
            "1m/T_err": zero,
            "2p/T_err": zero,
            "2m/T_err": zero,
            # size
            "00/s2n": snr,
            "1p/s2n": snr_1p,
            "1m/s2n": snr_1m,
            "2p/s2n": snr_2p,
            "2m/s2n": snr_2m,
            # Magntiudes and fluxes, just copied from the inputs.
            "00/mag_r": photo["mag_r"],
            "00/mag_i": photo["mag_i"],
            "00/mag_z": photo["mag_z"],
            "00/mag_err_r": photo["mag_r_err"],
            "00/mag_err_i": photo["mag_i_err"],
            "00/mag_err_z": photo["mag_z_err"],
            "1p/mag_r": photo["mag_r_1p"],
            "2p/mag_r": photo["mag_r_2p"],
            "1m/mag_r": photo["mag_r_1m"],
            "2m/mag_r": photo["mag_r_2m"],
            "1p/mag_i": photo["mag_i_1p"],
            "2p/mag_i": photo["mag_i_2p"],
            "1m/mag_i": photo["mag_i_1m"],
            "2m/mag_i": photo["mag_i_2m"],
            "1p/mag_z": photo["mag_z_1p"],
            "2p/mag_z": photo["mag_z_2p"],
            "1m/mag_z": photo["mag_z_1m"],
            "2m/mag_z": photo["mag_z_2m"],
            "1p/mag_err_r": photo["mag_r_err"],
            "2p/mag_err_r": photo["mag_r_err"],
            "1m/mag_err_r": photo["mag_r_err"],
            "2m/mag_err_r": photo["mag_r_err"],
            "1p/mag_err_i": photo["mag_i_err"],
            "2p/mag_err_i": photo["mag_i_err"],
            "1m/mag_err_i": photo["mag_i_err"],
            "2m/mag_err_i": photo["mag_i_err"],
            "1p/mag_err_z": photo["mag_z_err"],
            "2p/mag_err_z": photo["mag_z_err"],
            "1m/mag_err_z": photo["mag_z_err"],
            "2m/mag_err_z": photo["mag_z_err"],
            # Fixed PSF parameters - all round with same size
            "00/psf_g1": zero,
            "1p/psf_g1": zero,
            "1m/psf_g1": zero,
            "2p/psf_g1": zero,
            "2m/psf_g1": zero,
            "00/psf_g2": zero,
            "1p/psf_g2": zero,
            "1m/psf_g2": zero,
            "2p/psf_g2": zero,
            "2m/psf_g2": zero,
            "00/psf_T_mean": np.repeat(psf_T, nobj),
            "1p/psf_T_mean": np.repeat(psf_T, nobj),
            "1m/psf_T_mean": np.repeat(psf_T, nobj),
            "2p/psf_T_mean": np.repeat(psf_T, nobj),
            "2m/psf_T_mean": np.repeat(psf_T, nobj),
            # Everything that gets this far should be used, so flag=0
            "00/flags": np.zeros(nobj, dtype=np.int32),
            "1p/flags": np.zeros(nobj, dtype=np.int32),
            "1m/flags": np.zeros(nobj, dtype=np.int32),
            "2p/flags": np.zeros(nobj, dtype=np.int32),
            "2m/flags": np.zeros(nobj, dtype=np.int32),
            # we use weights of one for everything for metacal
            # if that ever changes we may also need to add
            # weight_1p, etc.
            "00/weight": np.ones(nobj),
            "1p/weight": np.ones(nobj),
            "1m/weight": np.ones(nobj),
            "2p/weight": np.ones(nobj),
            "2m/weight": np.ones(nobj),
        }

        return output

    def make_mock_metacal(self, data, photo):
        """
        I copied the old version here to use if we do not want metadetect.
        Generate a mock metacal table.
        This is a long and complicated function unfortunately.
        Throughout we assume a single R = R11 = R22, with R12 = R21 = 0

        Parameters
        ----------
        data: dict
            Dictionary of arrays read from the input cosmo catalog
        photo: dict
            Dictionary of arrays generated to simulate photometry
        """

        # These are the numbers from figure F1 of the DES Y1 shear catalog paper
        # (this version is not yet public but is awaiting a second referee response)

        # Overall SNR for the three bands usually used for shape measurement
        # We use the true SNR not the estimated one, though these are pretty close
        snr = (photo["snr_r"] ** 2 + photo["snr_i"] ** 2 + photo["snr_z"] ** 2) ** 0.5
        snr_1p = (photo["snr_r_1p"] ** 2 + photo["snr_i_1p"] ** 2 + photo["snr_z_1p"] ** 2) ** 0.5
        snr_1m = (photo["snr_r_1m"] ** 2 + photo["snr_i_1m"] ** 2 + photo["snr_z_1m"] ** 2) ** 0.5
        snr_2p = (photo["snr_r_2p"] ** 2 + photo["snr_i_2p"] ** 2 + photo["snr_z_2p"] ** 2) ** 0.5
        snr_2m = (photo["snr_r_2m"] ** 2 + photo["snr_i_2m"] ** 2 + photo["snr_z_2m"] ** 2) ** 0.5

        if self.config["unit_response"]:
            assert np.allclose(snr, snr_1p)
            assert np.allclose(snr, snr_2m)

        nobj = snr.size

        log10_snr = np.log10(snr)

        # Convert from the half-light radius which is in the
        # input catalogs to a sigma.  Do this by pretending
        # that it is a Gaussian.  This is clearly wrong, and
        # if this causes major errors in the size cuts we may
        # have to modify this
        size_hlr = data["size_true"]
        size_sigma = size_hlr / np.sqrt(2 * np.log(2))
        size_T = 2 * size_sigma**2

        # Use a fixed PSF across all the objects
        psf_fwhm = 0.75
        psf_sigma = psf_fwhm / (2 * np.sqrt(2 * np.log(2)))
        psf_T = 2 * psf_sigma**2

        if self.config["unit_response"]:
            R = 1.0
            R_size = 0.0

        else:
            # Use the response model to get a reasonable response
            # value for this size and SNR
            R_mean = self.R_spline(log10_snr, size_T, grid=False)
            R_std = self.Rstd_spline(log10_snr, size_T, grid=False)

            # Assume a 0.2 correlation between the size response
            # and the shear response.
            rho = 0.2
            f = np.random.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], nobj).T
            R, R_size = f * R_std + R_mean

        # Convert magnitudes to fluxes according to the baseline
        # use in the metacal numbers
        flux_r = 10**0.4 * (27 - photo["mag_r"])
        flux_i = 10**0.4 * (27 - photo["mag_i"])
        flux_z = 10**0.4 * (27 - photo["mag_z"])

        # Note that this is delta_gamma not 2*delta_gamma, because
        # of how we use it below
        delta_gamma = 0.01

        # Use a fixed shape noise per component to generate
        # an overall
        if self.config["add_shape_noise"]:
            shape_noise = 0.26
        else:
            shape_noise = 0.0

        eps = np.random.normal(0, shape_noise, nobj) + 1.0j * np.random.normal(0, shape_noise, nobj)
        # True shears without shape noise
        g1 = data["shear_1"]
        g2 = data["shear_2"]

        if self.config["flip_g2"]:
            g2 *= -1

        # Do the full combination of (g,epsilon) -> e, not the approx one
        g = g1 + 1j * g2
        e = (eps + g) / (1 + g.conj() * eps)
        e1 = e.real
        e2 = e.imag

        zero = np.zeros(nobj)
        # Now collect together everything to go into the metacal
        # file
        output = {
            # Basic values
            "id": photo["id"],
            "ra": photo["ra"],
            "dec": photo["dec"],
            # Keep the truth value just in case
            "true_g1": g1,
            "true_g2": g2,
            # add true redshift since it is used in source selector
            "redshift_true": photo["redshift_true"],
            # g1
            "g1": e1 * R,
            "g1_1p": (e1 + delta_gamma) * R,
            "g1_1m": (e1 - delta_gamma) * R,
            "g1_2p": e1 * R,
            "g1_2m": e1 * R,
            # g2
            "g2": e2 * R,
            "g2_1p": e2 * R,
            "g2_1m": e2 * R,
            "g2_2p": (e2 + delta_gamma) * R,
            "g2_2m": (e2 - delta_gamma) * R,
            # T
            "T": size_T,
            "T_1p": size_T + R_size * delta_gamma,
            "T_1m": size_T - R_size * delta_gamma,
            "T_2p": size_T + R_size * delta_gamma,
            "T_2m": size_T - R_size * delta_gamma,
            # Terr
            "T_err": zero,
            "T_err_1p": zero,
            "T_err_1m": zero,
            "T_err_2p": zero,
            "T_err_2m": zero,
            # size
            "s2n": snr,
            "s2n_1p": snr_1p,
            "s2n_1m": snr_1m,
            "s2n_2p": snr_2p,
            "s2n_2m": snr_2m,
            # Magntiudes and fluxes, just copied from the inputs.
            "mag_r": photo["mag_r"],
            "mag_i": photo["mag_i"],
            "mag_z": photo["mag_z"],
            "mag_err_r": photo["mag_r_err"],
            "mag_err_i": photo["mag_i_err"],
            "mag_err_z": photo["mag_z_err"],
            "mag_r_1p": photo["mag_r_1p"],
            "mag_r_2p": photo["mag_r_2p"],
            "mag_r_1m": photo["mag_r_1m"],
            "mag_r_2m": photo["mag_r_2m"],
            "mag_i_1p": photo["mag_i_1p"],
            "mag_i_2p": photo["mag_i_2p"],
            "mag_i_1m": photo["mag_i_1m"],
            "mag_i_2m": photo["mag_i_2m"],
            "mag_z_1p": photo["mag_z_1p"],
            "mag_z_2p": photo["mag_z_2p"],
            "mag_z_1m": photo["mag_z_1m"],
            "mag_z_2m": photo["mag_z_2m"],
            "mag_err_r_1p": photo["mag_r_err"],
            "mag_err_r_2p": photo["mag_r_err"],
            "mag_err_r_1m": photo["mag_r_err"],
            "mag_err_r_2m": photo["mag_r_err"],
            "mag_err_i_1p": photo["mag_i_err"],
            "mag_err_i_2p": photo["mag_i_err"],
            "mag_err_i_1m": photo["mag_i_err"],
            "mag_err_i_2m": photo["mag_i_err"],
            "mag_err_z_1p": photo["mag_z_err"],
            "mag_err_z_2p": photo["mag_z_err"],
            "mag_err_z_1m": photo["mag_z_err"],
            "mag_err_z_2m": photo["mag_z_err"],
            # Fixed PSF parameters - all round with same size
            "psf_g1": zero,
            "psf_g2": zero,
            "psf_T_mean": np.repeat(psf_T, nobj),
            # Everything that gets this far should be used, so flag=0
            "flags": np.zeros(nobj, dtype=np.int32),
            "flags_1p": np.zeros(nobj, dtype=np.int32),
            "flags_1m": np.zeros(nobj, dtype=np.int32),
            "flags_2p": np.zeros(nobj, dtype=np.int32),
            "flags_2m": np.zeros(nobj, dtype=np.int32),
            # we use weights of one for everything for metacal
            # if that ever changes we may also need to add
            # weight_1p, etc.
            "weight": np.ones(nobj),
        }

        return output

    def apply_magnitude_cut(self, data):
        """
        Allow for a cut in absolute r and i magnitudes.
        """
        mag_r_limit = self.config["Mag_r_limit"]
        mag_i_limit = self.config["Mag_i_limit"]

        sel_r = data["Mag_true_r_sdss_z0"] < mag_r_limit
        sel_i = data["mag_i"] < mag_i_limit

        sel = sel_r & sel_i

        ndet = sel.sum()
        ntot = sel.size
        fract = ndet * 100.0 / ntot

        print(
            f"{ndet} objects pass magnitude cuts "
            f"(r < {mag_r_limit}, i < {mag_i_limit}) "
            f"out of {ntot} objects ({fract:.1f}%)"
        )

        # Remove all objects not selected
        for key in list(data.keys()):
            data[key] = data[key][sel]

    def remove_undetected(self, data, photo):
        """
        Strip out any undetected objects from the two
        simulated data sets.

        Use a configuration parameter snr_limit to decide
        on the detection limit.
        """
        snr_limit = self.config["snr_limit"]

        # This will become a boolean array in a minute when
        # we OR it with an array
        detected = False

        # Check if detected in any band.  Makes a boolean array
        # Even though we started with just a single False.
        for band in self.bands:
            detected_in_band = photo[f"snr_{band}"] > snr_limit
            not_detected_in_band = ~detected_in_band
            # Set objects not detected in one band that are detected in another
            # to inf magnitude in that band, and the SNR to zero.
            # We have to do this for each of the variants also, because otherwise
            # we end up with wildly different final SNR values later.
            # This is the metadetection issue really!
            for v in metacal_variants(f"snr_{band}"):
                photo[v][not_detected_in_band] = 0.0
            for v in metacal_variants(f"mag_{band}"):
                photo[v][not_detected_in_band] = np.inf

            # Record that we have detected this object at all
            detected |= detected_in_band

        # the protoDC2 sims have an edge with zero shear.
        # Remove it.
        zero_shear_edge = (abs(data["shear_1"]) == 0) & (abs(data["shear_2"]) == 0)
        print("Removing {} objects with identically zero shear in both terms".format(zero_shear_edge.sum()))

        detected &= ~zero_shear_edge

        # Print out interesting information
        ndet = detected.sum()
        ntot = detected.size
        fract = ndet * 100.0 / ntot
        print(f"Detected {ndet} out of {ntot} objects ({fract:.1f}%)")

        # Remove all objects not detected in *any* band
        # make a copy of the keys with list(photo.keys()) so we are not
        # modifying during the iteration
        for key in list(photo.keys()):
            photo[key] = photo[key][detected]

        for key in list(data.keys()):
            data[key] = data[key][detected]




[docs]
class TXBuzzardMock(TXCosmoDC2Mock):
    """
    Simulate mock photometry from Buzzard.

    May be obsolete.
    """

    name = "TXBuzzardMock"
    parallel = False
    inputs = [("response_model", HDFFile)]

    outputs = [
        ("shear_catalog", ShearCatalog),
        ("photometry_catalog", HDFFile),
    ]

    config_options = {
        "cat_name": StageParameter(str, "buzzard", msg="Name of the mock catalog to use."),
        "visits_per_band": StageParameter(int, 165, msg="Number of visits per band for noise simulation."),
        "snr_limit": StageParameter(float, 4.0, msg="S/N limit for object selection."),
        "max_size": StageParameter(int, 99999999999999, msg="Maximum catalog size for testing."),
        "extra_cols": StageParameter(str, "", msg="Extra columns to include (space-separated)."),
        "max_npix": StageParameter(int, 99999999999999, msg="Maximum number of pixels."),
        "unit_response": StageParameter(bool, False, msg="Whether to use unit response in simulation."),
        "flip_g2": StageParameter(bool, True, msg="Whether to flip g2 sign to match conventions."),
    }




[docs]
class TXGaussianSimsMock(TXCosmoDC2Mock):
    """
    Simulate mock photometry from gaussian simulations

    This stage simulates metacal data and metacalibrated
    photometry measurements, starting from simple Gaussian simulations
    produced starting from CCL power spectra and poission sampling galaxies
    from it.

    This is mainly useful for testing infrastructure
    starting from a purer simulation.
    """

    name = "TXGaussianSimsMock"
    parallel = False
    inputs = [("response_model", HDFFile)]

    outputs = [
        ("shear_catalog", ShearCatalog),
        ("photometry_catalog", HDFFile),
    ]

    config_options = {
        "cat_name": StageParameter(str, "GaussianSims", msg="Name of the Gaussian simulation catalog."),
        "visits_per_band": StageParameter(int, 165, msg="Number of visits per band for noise simulation."),
        "snr_limit": StageParameter(float, 0.0, msg="S/N limit for object selection (0 for all)."),
        "max_size": StageParameter(int, 99999999999999, msg="Maximum catalog size for testing."),
        "extra_cols": StageParameter(str, "", msg="Extra columns to include (space-separated)."),
        "max_npix": StageParameter(int, 99999999999999, msg="Maximum number of pixels."),
        "unit_response": StageParameter(bool, True, msg="Whether to use unit response in simulation."),
        "cat_size": StageParameter(int, 0, msg="Catalog size (0 for all)."),
        "flip_g2": StageParameter(bool, False, msg="Whether to flip g2 sign to match conventions."),
        "apply_mag_cut": StageParameter(bool, False, msg="Apply magnitude cut for descqa comparison."),
        "apply_i_mag_cut": StageParameter(bool, False, msg="Apply observed i-band magnitude cut."),
        "Mag_i_limit": StageParameter(float, 24.25, msg="Observed i-band magnitude limit for object selection."),
        "metadetect": StageParameter(
            bool, True, msg="Whether to make a metadetect-style catalog (True) or metacal (False)."
        ),
        "add_shape_noise": StageParameter(bool, False, msg="Whether to add shape noise to simulation."),
    }

    def data_iterator(self, cat):
        # all cols we need
        (
            ra,
            dec,
            g1,
            g2,
            z,
            m_u,
            m_g,
            m_r,
            m_i,
            m_z,
            m_y,
            etrue1,
            etrue2,
            size,
            galaxy_id,
        ) = cat

        # figuring out number of chunks
        chunk_size = 1_000_000
        nchunk = len(ra) // chunk_size
        if nchunk * chunk_size < len(ra):
            nchunk += 1

        # main loop
        for i in range(nchunk):
            # start and end index of this chunk in full data
            s = chunk_size * i
            e = s + chunk_size

            # make dict just for this chunk of data
            data_chunk = {}
            data_chunk["ra"] = ra[s:e]
            data_chunk["dec"] = dec[s:e]
            data_chunk["shear_1"] = g1[s:e]
            data_chunk["shear_2"] = g2[s:e]
            data_chunk["size_true"] = size[s:e]
            data_chunk["galaxy_id"] = galaxy_id[s:e]
            data_chunk["redshift_true"] = z[s:e]
            data_chunk["ellipticity_1_true"] = etrue1[s:e]
            data_chunk["ellipticity_2_true"] = etrue2[s:e]
            data_chunk["mag_true_u_lsst"] = m_u[s:e]
            data_chunk["mag_true_g_lsst"] = m_g[s:e]
            data_chunk["mag_true_r_lsst"] = m_r[s:e]
            data_chunk["mag_true_i_lsst"] = m_i[s:e]
            data_chunk["mag_true_z_lsst"] = m_z[s:e]
            data_chunk["mag_true_y_lsst"] = m_y[s:e]

            # send this chunk of data back to caller
            yield data_chunk

    def load_catalog(self):
        """
        This method loads the Gaussian sims produced in this notebook
        https://github.com/LSSTDESC/txpipe-cosmodc2/blob/master/notebooks/MaskAreaGaussianSims_and_FakeTXPipeInput.ipynb
        """
        cat_name = self.config["cat_name"]
        self.bands = ("u", "g", "r", "i", "z", "y")

        print(f"Loading from catalog {cat_name}")

        cat = np.load(cat_name, allow_pickle=True)

        N = len(cat[0])

        return cat, N



def make_mock_photometry(n_visit, bands, data, unit_response):
    """
    Generate a mock photometric table with noise added

    This is mostly from LSE‐40 by
    Zeljko Ivezic, Lynne Jones, and Robert Lupton
    retrieved here:
    http://faculty.washington.edu/ivezic/Teaching/Astr511/LSST_SNRdoc.pdf
    """

    output = {}
    nobj = data["galaxy_id"].size
    output["ra"] = data["ra"]
    output["dec"] = data["dec"]
    output["id"] = data["galaxy_id"]
    output["extendedness"] = np.ones(nobj)
    if "redshift_true" in data:
        output["redshift_true"] = data["redshift_true"]
    # Sky background, seeing FWHM, and system throughput,
    # all from table 2 of Ivezic, Jones, & Lupton
    B_b = np.array([85.07, 467.9, 1085.2, 1800.3, 2775.7, 3614.3])
    fwhm = np.array([0.77, 0.73, 0.70, 0.67, 0.65, 0.63])
    T_b = np.array([0.0379, 0.1493, 0.1386, 0.1198, 0.0838, 0.0413])

    # effective pixels size for a Gaussian PSF, from equation
    # 27 of Ivezic, Jones, & Lupton
    pixel = 0.2  # arcsec
    N_eff = 2.436 * (fwhm / pixel) ** 2

    # other numbers from Ivezic, Jones, & Lupton
    sigma_inst2 = 10.0**2  # instrumental noise in photons per pixel, just below eq 42
    gain = 1  # ADU units per photon, also just below eq 42
    D = 6.5  # primary mirror diameter in meters, from LSST key numbers page (effective clear diameter)
    # Not sure what effective clear diameter means but it's closer to the number used in the paper
    time = 30.0  # seconds per exposure, from LSST key numbers page
    sigma_b2 = 0.0  # error on background, just above eq 42

    # combination of these  used below, from various equations
    factor = 5455.0 / gain * (D / 6.5) ** 2 * (time / 30.0)

    # Fake some metacal responses
    if unit_response:
        mag_responses = [0.0 for i in bands]
    else:
        mag_responses = generate_mock_metacal_mag_responses(bands, nobj)

    delta_gamma = 0.01  # this is the half-delta gamma, i.e. gamma_+ - gamma_0
    # that's the right thing to use here because we are doing m+ = m0 + dm/dy*dy
    # Use the same response for gamma1 and gamma2

    for band, b_b, t_b, n_eff, mag_resp in zip(bands, B_b, T_b, N_eff, mag_responses):
        # truth magnitude
        mag = data[f"mag_true_{band}_lsst"]
        output[f"mag_true_{band}_lsst"] = mag

        # expected signal photons, over all visits
        c_b = factor * 10 ** (0.4 * (25 - mag)) * t_b * n_visit

        # expected background photons, over all visits
        background = np.sqrt((b_b + sigma_inst2 + sigma_b2) * n_eff * n_visit)
        # total expected photons
        mu = c_b + background

        # Observed number of photons in excess of the expected background.
        # This can go negative for faint magnitudes, indicating that the object is
        # not going to be detected
        n_obs = np.random.poisson(mu) - background
        n_obs_err = np.sqrt(mu)

        # signal to noise, true and estimated values
        true_snr = c_b / background
        obs_snr = n_obs / background

        # observed magnitude from inverting c_b expression above
        mag_obs = 25 - 2.5 * np.log10(n_obs / factor / t_b / n_visit)

        # converting error on n_obs to error on mag
        mag_err = 2.5 / np.log(10.0) / obs_snr

        output[f"true_snr_{band}"] = true_snr
        output[f"snr_{band}"] = obs_snr
        output[f"mag_{band}"] = mag_obs
        output[f"mag_err_{band}"] = mag_err
        output[f"mag_{band}_err"] = mag_err

        m = mag_resp * delta_gamma

        m1 = mag_obs + m
        m2 = mag_obs - m
        mag_obs_1p = m1
        mag_obs_1m = m2

        output[f"mag_{band}_1p"] = m1
        output[f"mag_{band}_1m"] = m2
        output[f"mag_{band}_2p"] = m1
        output[f"mag_{band}_2m"] = m2

        # Scale the SNR values according the to change in magnitude.r
        s = np.power(10.0, -0.4 * m)

        snr1 = obs_snr * s
        snr2 = obs_snr / s
        output[f"snr_{band}_1p"] = snr1
        output[f"snr_{band}_1m"] = snr2
        output[f"snr_{band}_2p"] = snr1
        output[f"snr_{band}_2m"] = snr2

    return output


def generate_mock_metacal_mag_responses(bands, nobj):
    nband = len(bands)
    mu = np.zeros(nband)  # seems approx mean of response across bands, from HSC tract
    rho = 0.25  #  approx correlation between response in bands, from HSC tract
    sigma2 = 1.7**2  # approx variance of response, from HSC tract
    covmat = np.full((nband, nband), rho * sigma2)
    np.fill_diagonal(covmat, sigma2)
    mag_responses = np.random.multivariate_normal(mu, covmat, nobj).T
    return mag_responses



[docs]
class TXSimpleMock(PipelineStage):
    """
    Load an ascii astropy table and put it in shear catalog format.
    """

    name = "TXSimpleMock"
    parallel = False
    inputs = [("mock_shear_catalog", TextFile)]
    outputs = [("shear_catalog", ShearCatalog)]
    config_options = {}

    def run(self):
        from astropy.table import Table
        import numpy as np

        # Load the data. We are assuming here it is small enough to fit in memory
        input_filename = self.get_input("mock_shear_catalog")
        input_data = Table.read(input_filename, format="ascii")
        n = len(input_data)

        data = {}
        # required columns
        for col in ["ra", "dec", "g1", "g2", "s2n", "T"]:
            data[col] = input_data[col]

        # It's most likely we will have a redshift column.
        # Check for both that and "redshift_true"
        if "redshift" in input_data.colnames:
            data["redshift_true"] = input_data["redshift"]
        elif "redshift_true" in input_data.colnames:
            data["redshift_true"] = input_data["redshift_true"]

        # If there is an ID column then use it, but otherwise just use
        # sequential IDs
        if "id" in input_data.colnames:
            data["galaxy_id"] = input_data["id"]
        else:
            data["galaxy_id"] = np.arange(len(input_data))

        # if these catalogs are not present then we fake them.
        defaults = {
            "T_err": 0.0,
            "psf_g1": 0.0,
            "psf_g2": 0.0,
            "psf_T_mean": 0.202,  # this corresponds to a FWHM of 0.75 arcsec
            "weight": 1.0,
            "flags": 0,
        }

        for key, value in defaults.items():
            if key in input_data.colnames:
                data[key] = input_data[key]
            else:
                data[key] = np.full(n, value)

        self.save_catalog(data)

    def save_catalog(self, data):
        with self.open_output("shear_catalog") as f:
            g = f.create_group("shear")
            g.attrs["catalog_type"] = "simple"
            for key, value in data.items():
                g.create_dataset(key, data=value)




[docs]
class TXMockTruthPZ(PipelineStage):
    name = "TXMockTruthPZ"
    parallel = False
    inputs = [("shear_catalog", ShearCatalog)]
    outputs = [("photoz_pdfs", QPPDFFile)]
    config_options = {
        "mock_sigma_z": StageParameter(float, 0.001, msg="Sigma_z for mock photo-z PDF generation."),
    }

    def run(self):
        import qp
        import numpy as np

        sigma_z = self.config["mock_sigma_z"]

        # read the input truth redshifts
        with self.open_input("shear_catalog", wrapper=True) as f:
            group = f.file[f.get_primary_catalog_group()]
            n = group["ra"].size
            redshifts = group["redshift_true"][:]

        zgrid = np.linspace(0, 3, 301)
        pdfs = np.zeros((n, len(zgrid)))

        spread_z = sigma_z * (1 + redshifts)
        # make a gaussian PDF for each object
        delta = zgrid[np.newaxis, :] - redshifts[:, np.newaxis]
        pdfs = np.exp(-0.5 * (delta / spread_z[:, np.newaxis]) ** 2) / np.sqrt(2 * np.pi) / spread_z[:, np.newaxis]

        q = qp.Ensemble(qp.interp, data=dict(xvals=zgrid, yvals=pdfs))
        q.set_ancil(dict(zmode=redshifts, zmean=redshifts, zmedian=redshifts))
        q.write_to(self.get_output("photoz_pdfs"))



def test():
    import pylab

    data = {
        "ra": None,
        "dec": None,
        "galaxy_id": None,
    }
    bands = "ugrizy"
    n_visit = 165
    M5 = [24.22, 25.17, 24.74, 24.38, 23.80]
    for b, m5 in zip(bands, M5):
        data[f"mag_true_{b}_lsst"] = np.repeat(m5, 10000)
    results = make_mock_photometry(n_visit, bands, data, True)
    pylab.hist(results["snr_r"], bins=50, histtype="step")
    pylab.savefig("snr_r.png")