Source code for txpipe.lssweights

from .base_stage import PipelineStage
from .map_correlations import TXMapCorrelations
from .utils import choose_pixelization
from .data_types import (
    HDFFile,
    MapsFile,
    FileCollection,
    FiducialCosmology,
    TomographyCatalog,
    QPNOfZFile,
)
import numpy as np
import glob
import time
from .utils.theory import theory_3x2pt
from .utils.fitting import calc_chi2
from ceci.config import StageParameter


class TXLSSDensityBase(TXMapCorrelations):
    """
    Base class for LSS systematics

    Not to be run directly. This is an abstract base class,
    which other subclasses inherit from
    """

    name = "TXLSSDensityBase"
    parallel = False

    config_options = {
        "supreme_path_root": StageParameter(str, "", msg="Root path for supreme files."),
        "nbin": StageParameter(int, 20, msg="Number of tomographic bins."),
        "outlier_fraction": StageParameter(float, 0.01, msg="Fraction of outliers to exclude."),
        "allow_weighted_input": StageParameter(bool, False, msg="Allow weighted input catalogs."),
        "nside_coverage": StageParameter(int, 32, msg="HEALPix nside for coverage maps."),
        "fill_missing_pix": StageParameter(
            bool,
            False,
            msg="If True will fill any pixels that are in the mask but dont have SP maps with teh mean SP map value (use with care)",
        ),
        "equal_area_bins": StageParameter(
            bool, True, msg="Use equal area bins in 1d correlations, instead of equal spacing"
        ),
    }

    def read_healsparse(self, map_path, nside):
        """
        returns a healsparse object degraded to nside
        """
        import healsparse
        import os

        # if this is a symlink, use the real path to access the file
        if os.path.islink(map_path):
            real_map_path = os.path.realpath(map_path)
            print(f"{map_path} is a link, we will use the real path {real_map_path}")
        else:
            real_map_path = map_path

        # Convert to correct res healpix map
        m = healsparse.HealSparseMap.read(map_path)
        return m.degrade(nside)

    def prepare_sys_maps(self):
        """
        For this method we need sys maps to be normalized to mean 0
        """

        self.verify_config()

        sys_maps, sys_names = self.load_and_mask_sysmaps()

        # normalize sysmaps (and keep track of the normalization factors)
        mean, std = self.normalize_sysmaps(sys_maps)
        sys_meta = {"masked": True, "normed": True, "mean": mean, "std": std}

        for imap, sys_map in enumerate(sys_maps):
            sys_meta[f"edges_{imap}"] = self.compute_bin_edges(sys_map)

        return sys_maps, sys_names, sys_meta

    @staticmethod
    def normalize_sysmaps(sys_maps):
        """
        Normalize a list of healsparse maps to mean=0, std=1
        """

        means = []
        stds = []
        for sys_map in sys_maps:
            vpix = sys_map.valid_pixels
            sys_vals = sys_map[vpix]
            mean = np.mean(sys_vals)
            std = np.std(sys_vals)
            sys_map.update_values_pix(vpix, (sys_vals - mean) / std)
            means.append(mean)
            stds.append(std)

        return np.array(means), np.array(stds)  # return means and stds to help reconstruct the original maps later

    def get_deltag(self, tomobin):
        """
        convert the ra, dec of the lens sample to pixel number counts
        """
        import healpy as hp
        import healsparse as hsp

        nside = self.config["nside"]
        
        with self.open_input("mask", wrapper=True) as map_file:
            mask_map_info = map_file.read_map_info("mask")
            mask = map_file.read_mask("mask", degrade_nside=self.config["nside"])

        maskpix = mask.valid_pixels

        # TO DO: Parallelize this
        # load the ra and dec of this lens bins
        with self.open_input("binned_lens_catalog_unweighted", wrapper=False) as f:
            ra = f[f"lens/bin_{tomobin}/ra"][:]
            dec = f[f"lens/bin_{tomobin}/dec"][:]
            weight = f[f"lens/bin_{tomobin}/weight"][:]

        # pixel ID for each lens galaxy
        obj_pix = hp.ang2pix(nside, ra, dec, lonlat=True, nest=True)

        #this is not healsparsed yet
        Ncounts_bincount = np.bincount(obj_pix, weights=weight)
        pixels_bincount = np.arange(len(Ncounts_bincount))

        #only select the occupied pixels
        pixel = pixels_bincount[Ncounts_bincount != 0.0]
        Ncounts = Ncounts_bincount[Ncounts_bincount != 0.0]

        deltag = hsp.HealSparseMap.make_empty(self.config["nside_coverage"], nside, dtype=np.float64)
        deltag.update_values_pix(maskpix, 0.0) #initially set all valid pixels to 0
        deltag.update_values_pix(pixel, Ncounts) #then set the occupied values to their ncount
        n = deltag[maskpix] / mask[maskpix]
        nmean = np.average(n, weights=mask[maskpix])
        deltag.update_values_pix(maskpix, n / nmean - 1.0)  # overdenity map

        return deltag, mask

    def load_and_mask_sysmaps(self):
        """
        load the SP maps and mask them
        """
        import healpy as hp
        import healsparse as hsp

        s = time.time()

        root = self.config["supreme_path_root"]
        sys_files = glob.glob(f"{root}*.hs")
        sys_files = np.sort(sys_files)
        nsys = len(sys_files)
        print(f"Found {nsys} total systematic maps")

        with self.open_input("mask", wrapper=True) as map_file:
            # load *boolean* mask
            mask = map_file.read_mask(
                "mask", degrade_nside=self.config["nside"], returnbool=True
            )

        nside = mask.nside_sparse

        sys_maps = []
        sys_names = []
        for i, map_path in enumerate(sys_files):
            # strip root, .hs, and underscores to get friendly name
            sys_name = map_path[len(root) : -3].strip("_")
            sys_names.append(sys_name)

            # get actual data for this map
            sys_map = self.read_healsparse(map_path, nside)

            # insist all "good" pixels in the mask have an SP map value for this map
            n_missing = np.sum(sys_map[mask.valid_pixels] == hp.UNSEEN)
            if self.config["fill_missing_pix"]:
                if n_missing != 0:
                    print(f"Found {n_missing}/{mask.n_valid} mask pixels with missing {sys_name} values", end="")
                    missing_pix = mask.valid_pixels[sys_map[mask.valid_pixels] == hp.UNSEEN]
                    mean_sys = np.mean(sys_map.get_values_pix(sys_map.valid_pixels))
                    print(f", filling with mean value {mean_sys}")
                    sys_map.update_values_pix(missing_pix, mean_sys)
            else:
                assert n_missing == 0, (
                    f"Found {n_missing}/{mask.n_valid} mask pixels with missing {sys_name} values, and fill_missing_pix was False"
                )

            # mask is boolean so this should return a map with just the valid sys_map pixels
            sys_map = hsp.operations.product_intersection([sys_map, mask])
            sys_maps.append(sys_map)

        f = time.time()
        print("load_and_mask_sp_maps took {0}s".format(f - s))

        return np.array(sys_maps), np.array(sys_names)

    def calculate_1d_density_correlations(self, tomobin, mean_density_map=None):
        """
        compute the binned 1d density correlations for a single tomographic lens bin

        Parameters
        ----------
        tomobin: Integer
            index for the tomographic lens bin
        mean_density_map: healsparse map (None)
            map of inverse weight to be applied to the galaxies
            if None, will use unweighted lens catalog

        Returns
        -------
        density_corrs: lsstools.DensityCorrelation
            DensityCorrelation instance containing the number
            counts/density vs sysmap for all sysmaps
        """
        import healpy as hp
        from . import lsstools

        s = time.time()

        with self.open_input("mask", wrapper=True) as map_file:
            mask_map_info = map_file.read_map_info("mask")
            mask = map_file.read_map("mask")
        nside = mask_map_info["nside"]
        nest = mask_map_info["nest"]

        # load the ra and dec of this lens bins
        with self.open_input("binned_lens_catalog_unweighted", wrapper=False) as f:
            ra = f[f"lens/bin_{tomobin}/ra"][:]
            dec = f[f"lens/bin_{tomobin}/dec"][:]

            # pixel ID for each lens galaxy
            obj_pix = hp.ang2pix(nside, ra, dec, lonlat=True, nest=True)

            weight = f[f"lens/bin_{tomobin}/weight"][:]  # input object weights

            if self.config["allow_weighted_input"] == False:
                assert (weight == 1.0).all()  # Lets assume the input weights have to be 1 by default
            else:
                if (weight == 1.0).all() == False:
                    print("WARNING: Your input lens catalog has weights != 1")
                    print("You have set allow_weighted_input==True so I will allow this")
                    print("Output weight=input_weight*sys_weight")

            if mean_density_map is not None:
                weight = weight * 1.0 / mean_density_map[obj_pix]

        density_corrs = lsstools.DensityCorrelation(tomobin=tomobin)  # keeps track of the 1d plots
        for imap, sys_map in enumerate(self.sys_maps):
            sys_vals = sys_map[sys_map.valid_pixels]  # SP value in each valid pixel
            sys_obj = sys_map[obj_pix]  # SP value for each object in catalog

            if nest:  # ideally we dont want if statements like this....
                frac = mask[sys_map.valid_pixels]
            else:
                frac = mask[hp.nest2ring(nside, sys_map.valid_pixels)]

            sys_name = None if self.sys_names is None else self.sys_names[imap]

            edges = self.sys_meta[f"edges_{imap}"]

            density_corrs.add_correlation(
                imap,
                edges,
                sys_vals,
                sys_obj,
                frac=frac,
                weight=weight,
                sys_name=sys_name,
            )

            # also precompute the SP bined arrays and pixel counts
            density_corrs.precompute(imap, edges, sys_vals, frac=frac)

        density_corrs.sys_meta.update(self.sys_meta)

        f = time.time()
        print("calculate_1d_density_correlations took {0}s".format(f - s))

        return density_corrs

    def compute_bin_edges(self, sys_map):
        import scipy.stats

        nsysbins = self.config[
            "nbin"
        ]  # nominal number of SP bins (this can change if too many pixels have the same value)
        f = 0.5 * self.config["outlier_fraction"]
        percentiles = np.linspace(f, 1 - f, nsysbins + 1)

        sys_vals = sys_map[sys_map.valid_pixels]  # SP value in each valid pixel

        if self.config["equal_area_bins"]:
            edges = scipy.stats.mstats.mquantiles(sys_vals, percentiles)
        else:
            edges = np.linspace(
                np.percentile(sys_vals, 100.0 * percentiles[0]),
                np.percentile(sys_vals, 100.0 * percentiles[-1]),
                nsysbins + 1,
            )

        # Remove any duplciates from the edges array
        # This can occur when a large fraction of the survey has the same value in the SP map
        u_edges, idx = self.unique_tol(
            edges, tol=10 * np.finfo(edges.dtype).eps * np.abs(edges).max(), return_index=True
        )
        if len(u_edges) != len(edges):
            edges = edges[np.sort(idx)]

        # remove empty bins
        counts, _ = np.histogram(sys_vals, bins=edges)
        nonempty = counts > 0
        keep = np.concatenate(([True], nonempty))  # always keep 1st and last bin edge
        edges = edges[keep]

        return edges

    @staticmethod
    def unique_tol(arr, tol=1e-8, return_index=False):
        """
        Simple replacement for np.unique that allows for a small tolerance
        Useful when SP map values differ only by machine precision
        """
        out = []
        idx = []
        for i, x in enumerate(arr):
            if not any(np.isclose(x, y, atol=tol, rtol=0) for y in out):
                out.append(x)
                idx.append(i)
        out = np.array(out, dtype=arr.dtype)
        idx = np.array(idx, dtype=int)

        if return_index:
            return out, idx
        return out


class TXLSSDensityNullTests(TXLSSDensityBase):
    """
    Compute density correlations between lens sample density and survey property maps
    and its covariance
    """

    name = "TXLSSDensityNullTests"
    parallel = False
    inputs = [
        (
            "binned_lens_catalog_unweighted",
            TomographyCatalog,
        ),  # this file is used by the stage to compute density correlations
        ("mask", MapsFile),
        ("lens_photoz_stack", QPNOfZFile),  # Photoz stack (need if using theory curve in covariance)
        (
            "fiducial_cosmology",
            FiducialCosmology,
        ),  # Needed for the sample variance term if using full covariance calculation
    ]

    outputs = [
        ("lss_density_plots", FileCollection),  # output files and summary statistics will go here
        ("unweighted_density_correlation", HDFFile),
    ]

    config_options = {
        **TXLSSDensityBase.config_options,
        "simple_cov": StageParameter(
            bool, False, msg="if True will use a diagonal shot noise only covariance for the 1d relations"
        ),
        "diag_blocks_only": StageParameter(
            bool,
            True,
            msg="If True, will compute only the diagonal blocks of the 1D covariance matrix (no correlation between SP maps)",
        ),
        "b0": StageParameter(list, [1.0], msg="Galaxy bias values."),
    }

    def run(self):
        """
        Run the stage

        Steps
        (1) Prepare survey properties (SP maps) load, degrade, normalize, etc
        (2) Compute 1d density trends Ngal vs SP
        (3) Compute the covariance matrix of the density trends
        (4) Summarize (save plots, data points and covariance, etc)

        Each step can be overwritten in sub-classes
        """
        import sacc
        import healsparse

        pixel_scheme = choose_pixelization(**self.config)
        self.pixel_metadata = pixel_scheme.metadata
        self.pixel_metadata["nest"] = True #This stage uses healsparse maps which always use nested ordering

        # check the metadata nside matches the mask (might not be true if you use an external mask)
        with self.open_input("mask", wrapper=True) as map_file:
            mask_nside = map_file.read_map_info("mask")["nside"]
        assert self.pixel_metadata["nside"] == mask_nside

        # get number of tomographic lens bins
        with self.open_input("binned_lens_catalog_unweighted", wrapper=False) as f:
            self.Ntomo = f["lens"].attrs["nbin_lens"]

        # output directory for the plots and summary stats
        output_dir = self.open_output("lss_density_plots", wrapper=True)

        # open outdir density correlation file
        dens_output = self.open_output("unweighted_density_correlation", wrapper=False)

        # load the SP maps, apply the mask, normalize the maps (as needed by the method)
        self.sys_maps, self.sys_names, self.sys_meta = self.prepare_sys_maps()

        for ibin in range(self.Ntomo):
            # compute density vs SP map data vector
            density_corrs = self.calculate_1d_density_correlations(ibin)

            # compute covariance of data vector and add to DensityCorrelation object
            self.calc_covariance(density_corrs)

            # Add the null model n_dens/<n_dens> = 1 to the
            # DensityCorrelation object and compute its chi2
            density_corrs.add_null_model()

            # make summary stats and plots
            self.summarize_density(output_dir, dens_output, density_corrs)
        dens_output.close()

    def summarize_density(self, output_dir, dens_output, density_correlation):
        """
        make 1d density plots and other summary statistics and save them

        Parameters
        ----------
        output_dir: string

        density_correlation: lsstools.DensityCorrelation
        """
        import scipy.stats

        ibin = density_correlation.tomobin

        # save the 1D density trends
        # tomo bin label is taken from density_correlation
        density_correlation.save_to_group(dens_output)

        # plot 1d density trends
        for imap in np.unique(density_correlation.map_index):
            try:
                splabel = density_correlation.mapnames[imap]
            except KeyError:
                splabel = imap
            filepath = output_dir.path_for_file(f"sys1D_lens{ibin}_SP{splabel}.png")
            density_correlation.plot1d_singlemap(
                filepath,
                imap,
                plot_hist=True,
            )

        filepath = output_dir.path_for_file(f"chi2_hist_lens{ibin}.png")
        density_correlation.plot_chi2_hist(filepath, chi2_threshold=None)

    def calc_covariance(self, density_correlation):
        """
        Construct the covariance matrix of the ndens vs SP data-vector

        Parameters
        ----------
        density_correlation: lsstools.DensityCorrelation

        """
        s = time.time()

        # add diagonal shot noise
        density_correlation.add_diagonal_shot_noise_covariance(assert_empty=True)

        if self.config["simple_cov"] == False:
            if self.config["diag_blocks_only"] == False:
                # add off-diagonal shot noise
                cov_shot_noise_full = self.calc_covariance_shot_noise_offdiag(density_correlation, self.sys_maps)
                density_correlation.add_external_covariance(cov_shot_noise_full, assert_empty=False)

            # add clustering Sample Variance
            cov_sample_variance_full = self.calc_covariance_sample_variance(
                density_correlation,
                self.sys_maps,
                diag_blocks_only=self.config["diag_blocks_only"],
            )
            density_correlation.add_external_covariance(cov_sample_variance_full, assert_empty=False)

        f = time.time()
        print("calc_covariance took {0}s".format(f - s))

        return

    def calc_covariance_shot_noise_offdiag(self, density_correlation, sys_maps):
        """
        Shot noise-only covariance (off-diagonal blocks only)
        Does not include sample variance terms
        Off-diagonal terms are between different SP maps
        see https://github.com/elvinpoole/1dcov/blob/main/notes/1d_covariance_notes.pdf

        Parameters
        ----------

        density_correlation: lsstools.DensityCorrelation

        sys_maps: array of healsparse maps

        """
        import healpy as hp

        # get nside from the mask
        with self.open_input("mask", wrapper=True) as map_file:
            nside = map_file.read_map_info("mask")["nside"]

        # load the galaxy ra and dec of this lens bins
        # TODO: load lens sample in chunks if needed for memory
        tomobin = density_correlation.tomobin
        with self.open_input("binned_lens_catalog_unweighted", wrapper=False) as f:
            ra = f[f"lens/bin_{tomobin}/ra"][:]
            dec = f[f"lens/bin_{tomobin}/dec"][:]
            weight = f[f"lens/bin_{tomobin}/weight"][:]

        # pixel ID for each lens galaxy
        obj_pix = hp.ang2pix(nside, ra, dec, lonlat=True, nest=True)

        # Covariance matrix on number *counts*
        nbinstotal = len(density_correlation.map_index)
        covmat_N = np.zeros((nbinstotal, nbinstotal))

        # loop over each pair of maps to get 2d histograms
        # and fill in the Nobj covarianace matrix blocks
        map_list = np.unique(density_correlation.map_index).astype("int")
        n2d = {}
        for imap in map_list:
            sys_map_i = sys_maps[imap]
            sys_obj_i = sys_map_i[obj_pix]
            edgesi = density_correlation.get_edges(imap)
            maski = np.where(density_correlation.map_index.astype("int") == imap)[0]
            starti = maski[0]
            finishi = maski[-1] + 1
            for jmap in map_list:
                if jmap <= imap:
                    continue
                sys_map_j = sys_maps[jmap]
                sys_obj_j = sys_map_j[obj_pix]
                edgesj = density_correlation.get_edges(jmap)
                maskj = np.where(density_correlation.map_index.astype("int") == jmap)[0]
                startj = maskj[0]
                finishj = maskj[-1] + 1

                n2d_pair, _, _ = np.histogram2d(sys_obj_i, sys_obj_j, bins=(edgesi, edgesj), weights=weight)
                n2d[imap, jmap] = n2d_pair

                covmat_N[starti:finishi, startj:finishj] = n2d_pair
                covmat_N[startj:finishj, starti:finishi] = n2d_pair.T

        # convert N (number count) covariance into n (normalized number density) covariance
        # cov(n1,n2) = cov(N1,N2)*norm**2/(Npix1*Npix2)
        # TODO: replace np.matrix with @ operator
        npix1npix2 = np.matrix(density_correlation.npix).T * np.matrix(density_correlation.npix)
        norm2 = np.matrix(density_correlation.norm).T * np.matrix(density_correlation.norm)
        covmat_ndens = covmat_N * np.array(norm2) / np.array(npix1npix2)

        return covmat_ndens

    def calc_covariance_sample_variance(self, density_correlation, sys_maps, diag_blocks_only=False):
        """
        Sample variance term in 1d binned covariance

        uses treecorr to compute the two point function between pixel positions in different SP bins
        see https://github.com/elvinpoole/1dcov/blob/main/notes/1d_covariance_notes.pdf

        Parameters
        ----------
        density_correlation: lsstools.DensityCorrelation

        sys_maps: array of healsparse maps

        """
        import treecorr
        import healpy as hp
        import pyccl
        from scipy.interpolate import interp1d

        nbinstotal = len(density_correlation.map_index)

        # generate theory wtheta
        mintheta = hp.nside2resol(sys_maps[0].nside_sparse, arcmin=True)
        maxtheta = 250.0  # in arcmin
        theta = np.linspace(mintheta, maxtheta, 100)
        with self.open_input("fiducial_cosmology", wrapper=True) as f:
            cosmo = f.to_ccl()
        z_n, nz = self.load_tracer(density_correlation.tomobin)
        theory_ell = np.unique(np.geomspace(1, 3000, 100).astype(int))
        if isinstance(self.config["b0"], list):
            if len(self.config["b0"]) == 1 and self.Ntomo != 1:
                print(
                    "Recieved one linear galaxy bias b0, but you have more than one tomographic bin. Using single value for all bins"
                )
                b0 = self.config["b0"][0]
            else:
                assert len(self.config["b0"]) == self.Ntomo
                b0 = self.config["b0"][density_correlation.tomobin]
        else:
            b0 = self.config["b0"]
        bias = b0 * np.ones(len(z_n))
        gal_tracer = pyccl.NumberCountsTracer(cosmo, dndz=(z_n, nz), bias=(z_n, bias), has_rsd=True)
        C_l = cosmo.angular_cl(gal_tracer, gal_tracer, theory_ell)
        wtheta = cosmo.correlation(
            ell=theory_ell,
            C_ell=C_l,
            theta=theta / 60.0,
            type="NN",
        )
        wtheta_interp = interp1d(theta, wtheta)

        map_list = np.unique(density_correlation.map_index).astype("int")

        # make a dict of treecorr Catalog objects
        # TODO: test how the memory use scales with NSIDE
        cats = {}
        for imap in map_list:
            ra_i, dec_i = sys_maps[imap].valid_pixels_pos(lonlat=True)
            edges_i = density_correlation.get_edges(imap)
            for isp in range(len(edges_i) - 1):
                selecti = density_correlation.precomputed_array[imap][isp]
                cat_i = treecorr.Catalog(
                    ra=ra_i[selecti],
                    dec=dec_i[selecti],
                    ra_units="degrees",
                    dec_units="degrees",
                )
                cats[imap, isp] = cat_i

        # Compute 2point function of the SP map pixel centres
        # This should be the same for all lens bins so only compute it if this is the first bin
        if "sp_pixel_twopoint" not in self.sys_meta.keys():
            self.sys_meta["sp_pixel_twopoint"] = {}
            for imap in map_list:
                edges_i = density_correlation.get_edges(imap)
                print("SV covariance for map", imap)
                for isp in range(len(edges_i) - 1):
                    cat_i = cats[imap, isp]
                    indexi = np.where(density_correlation.map_index == imap)[0][isp]

                    for jmap in map_list:
                        edges_j = density_correlation.get_edges(jmap)
                        for jsp in range(len(edges_j) - 1):
                            indexj = np.where(density_correlation.map_index == jmap)[0][jsp]
                            if indexi > indexj:
                                continue
                            if diag_blocks_only and imap != jmap:  # sometimes we dont need the covariance between maps
                                continue
                            cat_j = cats[jmap, jsp]

                            nn = treecorr.NNCorrelation(
                                max_sep=maxtheta,
                                min_sep=mintheta,
                                nbins=20,
                                sep_units="arcmin",
                            )
                            nn.process(cat_i, cat_j)
                            self.sys_meta["sp_pixel_twopoint"][indexi, indexj] = nn
                            self.sys_meta["sp_pixel_twopoint"][indexj, indexi] = nn

        # now construct the covariance from the Npairs and theory curve
        # Covariance matrix on number *counts*
        covmat_N = np.zeros((nbinstotal, nbinstotal))
        for imap in map_list:
            edges_i = density_correlation.get_edges(imap)
            for isp in range(len(edges_i) - 1):
                indexi = np.where(density_correlation.map_index == imap)[0][isp]
                for jmap in map_list:
                    edges_j = density_correlation.get_edges(jmap)
                    for jsp in range(len(edges_j) - 1):
                        indexj = np.where(density_correlation.map_index == jmap)[0][jsp]
                        if indexi > indexj:
                            continue
                        if diag_blocks_only and imap != jmap:  # sometimes we dont need the covarinace between maps
                            continue
                        nn = self.sys_meta["sp_pixel_twopoint"][indexi, indexj]
                        covmat_N[indexi, indexj] = np.sum(nn.npairs * wtheta_interp(nn.meanr))
                        covmat_N[indexj, indexi] = covmat_N[indexi, indexj]

        # I did not include nbar in covmat_N because this would get divided out here
        npix1npix2 = np.matrix(density_correlation.npix).T * np.matrix(density_correlation.npix)
        covmat_ndens = covmat_N / np.array(npix1npix2)

        return covmat_ndens

    def load_tracer(self, tomobin):
        """
        Load the N(z) (lenses only)
        We need this to compute the theory estimate
        in the SV term in the 1d covariance
        """
        with self.open_input("lens_photoz_stack", wrapper=True) as f_lens:
            z, nz = f_lens.get_bin_n_of_z(tomobin)
        return z, nz

    def verify_config(self):
        """
        Nothing to verify for the null tests
        """
        pass

[docs] class TXLSSWeights(TXLSSDensityBase): """ Base class for LSS systematic weights Not to be run directly. This is an abstract base class, which other subclasses inherit from to use the same basic structure, which is: - load and process sytematic (survey property) maps - compute 1d density correlations+covariance - compute weights with a regression method """ name = "TXLSSWeights" parallel = False inputs = [ ("binned_lens_catalog_unweighted", TomographyCatalog), # this file is used by the stage to compute weights ( "lens_tomography_catalog_unweighted", TomographyCatalog, ), # this file is copied at the end and a weighted version is made (for stages that use this instead of the binned catalogs) ("mask", MapsFile), ("unweighted_density_correlation", HDFFile), ] outputs = [ ("lss_weight_summary", FileCollection), # output files and summary statistics will go here ("weighted_density_correlation", HDFFile), ("lss_weight_maps", MapsFile), # the systematic weight maps to be applied to the lens galaxies ("binned_lens_catalog", HDFFile), # the lens catalog with weights added ("lens_tomography_catalog", HDFFile), # the tomography file with weights added ] config_options = { **TXLSSDensityBase.config_options, } def run(self): """ Follows the basic stucture of a regression based weights pipeline: (1) Prepare survey properties (SP maps) load, degrade, normalize, etc (2) Load 1d density trends Ngal vs SP (3) Compute the covariance matrix of the density trends (4) Compute the weights using a given model (5) Summarize (save plots, regression params, etc) (6) Save the weighted catalog and weight maps Each step can be overwritten in sub-classes """ import sacc import healsparse pixel_scheme = choose_pixelization(**self.config) self.pixel_metadata = pixel_scheme.metadata self.pixel_metadata["nest"] = True #This stage uses healsparse maps which always use nested ordering # check the metadata nside matches the mask (might not be true of you use an external mask) with self.open_input("mask", wrapper=True) as map_file: mask_nside = map_file.read_map_info("mask")["nside"] assert self.pixel_metadata["nside"] == mask_nside # get number of tomographic lens bins with self.open_input("binned_lens_catalog_unweighted", wrapper=False) as f: self.Ntomo = f["lens"].attrs["nbin_lens"] # output directory for the plots and summary stats output_dir = self.open_output("lss_weight_summary", wrapper=True) # open density corr file dens_output = self.open_output("weighted_density_correlation", wrapper=False) # load the SP maps, apply the mask, normalize the maps (as needed by the method) self.sys_maps, self.sys_names, self.sys_meta = self.prepare_sys_maps() mean_density_map_list = [] for ibin in range(self.Ntomo): # load density vs SP map data vector density_corrs = self.load_density_corr(ibin) # compute the weights mean_density_map, fit_output = self.compute_weights(density_corrs) mean_density_map_list.append(mean_density_map) # Calculate the weighted version of the same thing. weighted_density_corrs = self.calculate_1d_density_correlations(ibin, mean_density_map=mean_density_map) # add info from unweighted density_corrs to weighted if weighted_density_corrs is not None: weighted_density_corrs.postprocess(density_corrs) # make summary stats and plots self.summarize_weights(output_dir, dens_output, density_corrs, weighted_density_corrs, fit_output) dens_output.close() # save object weights and weight maps self.save_weights(output_dir, mean_density_map_list) def verify_config(self): """ Checks the SP map config being used for this stage matches that of the input density null tests """ #these config options current have to be the same between density Null calls and LSS weights calls config_asserts = [ "equal_area_bins", "fill_missing_pix", "nbin", "nside", "nside_coverage", "outlier_fraction", "pixelization", "supreme_path_root", ] with self.open_input("unweighted_density_correlation") as f: for c in config_asserts: c_input = str(f["provenance"].attrs[f"config/{c}"]) c_stage = str(self.config[c]) assert c_input == c_stage def load_density_corr(self, ibin): """ Load some 1d density trends from a TXDensityNull-like stage """ from . import lsstools with self.open_input("unweighted_density_correlation", wrapper=False) as f: return lsstools.DensityCorrelation.load_from_group(f[f"density_{ibin}"]) def save_weights(self, output_dir, mean_density_map_list): """ Save the weights maps and galaxy weights Parameters ---------- mean_density_map_list: list of Healsparse maps list of the F(s) maps. one for each tomo bin F(s) = 1/weight """ import healpy as hp import matplotlib.pyplot as plt #### save the weights maps map_output_file = self.open_output("lss_weight_maps", wrapper=True) map_output_file.file.create_group("maps") map_output_file.file["maps"].attrs.update(self.config) for ibin, mean_density_map in enumerate(mean_density_map_list): weight_map = mean_density_map**-1 map_output_file.write_map( f"weight_map_bin_{ibin}", weight_map, self.pixel_metadata ) #### save the binned lens samples # There is probably a better way to do this using the batch writer binned_output = self.open_output("binned_lens_catalog", parallel=True) with self.open_input("binned_lens_catalog_unweighted") as binned_input: binned_output.copy(binned_input["lens"], "lens") # Plot a histogram of the weights fig, axs = plt.subplots(1, self.Ntomo, figsize=(3 * self.Ntomo, 3), squeeze=False) for ibin in range(self.Ntomo): # get weight per lens object subgroup = binned_output[f"lens/bin_{ibin}/"] ra = subgroup["ra"][:] dec = subgroup["dec"][:] # Get the pixel index for each galaxy in our density map. # could switch to using txpipe tools for this? pix = hp.ang2pix(self.pixel_metadata["nside"], ra, dec, lonlat=True, nest=True) # Compute the weight corresponding to each object w = mean_density_map_list[ibin][pix] obj_weight = 1.0 / w obj_weight[w == hp.UNSEEN] = 0.0 subgroup["weight"][:] *= obj_weight # Draw the histogram for this tomographic bin in the correct # panel of the figure axs[0][ibin].hist(obj_weight[obj_weight != 0], bins=100) axs[0][ibin].set_xlabel("weight") axs[0][ibin].set_yticks([]) axs[0][ibin].set_title(f"lens {ibin}") filepath = output_dir.path_for_file(f"weights_hist.png") fig.tight_layout() fig.savefig(filepath) fig.clear() plt.close() #### save the tomography file tomo_output = self.open_output("lens_tomography_catalog", parallel=True) with self.open_input("lens_tomography_catalog_unweighted") as tomo_input: tomo_output.copy(tomo_input["tomography"], "tomography") tomo_output.copy(tomo_input["counts"], "counts") lens_weight = tomo_output["tomography/lens_weight"][:] for ibin in range(self.Ntomo): binned_subgroup = binned_output[f"lens/bin_{ibin}/"] mask = tomo_output["tomography/bin"][:] == ibin lens_weight[mask] *= binned_subgroup["weight"][:] tomo_output["tomography/lens_weight"][:] = lens_weight #### close the outputs map_output_file.close() tomo_output.close() binned_output.close() def summarize_weights(self, output_dir, dens_output, density_correlation, weighted_density_correlation, fit_output): """ make 1d density plots and other summary statistics and save them Parameters ---------- output_dir: string density_correlation: lsstools.DensityCorrelation weighted_density_correlation: lsstools.DensityCorrelation fit_output: dict """ import h5py import scipy.stats ibin = density_correlation.tomobin # make 1d density plots for imap in np.unique(density_correlation.map_index): try: splabel = density_correlation.mapnames[imap] except KeyError: splabel = imap filepath = output_dir.path_for_file(f"sys1D_lens{ibin}_SP{splabel}.png") density_correlation.plot1d_singlemap( filepath, imap, label=None, extra_density_correlations=[weighted_density_correlation], extra_density_labels=["weighted"], ) # save the weighted 1D density trends weighted_density_correlation.save_to_group(dens_output) # tomo bin label is taken from density_correlation # save map names, coefficients and chi2 from the fit into an hdf5 file fit_summary_file_name = output_dir.path_for_file(f"fit_summary_lens{ibin}.hdf5") fit_summary_file = h5py.File(fit_summary_file_name, "w") fit_summary_file.file.create_group("fit_summary") summary_group = fit_summary_file["fit_summary"] if len(fit_output["sig_map_index"]) != 0: summary_group.create_dataset("fitted_map_id", data=fit_output["sig_map_index"]) fitted_maps_names = np.array([density_correlation.mapnames[i] for i in fit_output["sig_map_index"]]) summary_group.create_dataset("fitted_map_names", data=fitted_maps_names.astype(np.bytes_)) summary_group.create_dataset("all_map_names", data=self.sys_names.astype(np.bytes_)) summary_group.create_dataset("coeff", data=fit_output["coeff"]) if fit_output["coeff_cov"] is not None: summary_group.create_dataset("coeff_cov", data=fit_output["coeff_cov"]) for model in density_correlation.chi2.keys(): summary_group.create_dataset( f"chi2_{model}", data=np.array(list(density_correlation.chi2[model].items())).T, ) fit_summary_file.close() # plot the un-weighted and weighted chi2 distribution filepath = output_dir.path_for_file(f"chi2_hist_lens{ibin}.png") if "pvalue_threshold" in self.config.keys(): chi2_threshold = scipy.stats.chi2(self.config["nbin"]).isf(self.config["pvalue_threshold"]) else: chi2_threshold = None density_correlation.plot_chi2_hist( filepath, extra_density_correlations=[weighted_density_correlation], chi2_threshold=chi2_threshold, )
[docs] class TXLSSWeightsLinBinned(TXLSSWeights): """ Compute LSS systematic weights using simultanious linear regression on the binned 1D correlations Model: Linear Covariance: Shot noise (for now), no correlation between 1d correlations Fit: Simultaniously fits all sysmaps. By calculating a total weight map and calculating Ndens vs sysmap directly """ name = "TXLSSWeightsLinBinned" parallel = False config_options = { **TXLSSWeights.config_options, "pvalue_threshold": StageParameter( float, 0.05, msg="max p-value for maps to be included in the corrected (a very simple form of regularization)", ), } def select_maps(self, density_correlation): """ Returns the map indices that have small null p-values (large chi2) Threshold p-value set in config Parameters ---------- density_correlation: lsstools.DensityCorrelation """ import scipy.stats chi2_null = density_correlation.chi2["null"] nbins_per_map = self.config["nbin"] map_index_array = np.array(list(chi2_null.keys())) chi2_array = np.array(list(chi2_null.values())) # pvalues of the null signal for each map p = 1.0 - scipy.stats.chi2(nbins_per_map).cdf(chi2_array) sig_maps = map_index_array[p < self.config["pvalue_threshold"]] return sig_maps def compute_weights(self, density_correlation): """ Least square fit to a simple linear model The function being optimized is a sum of the F(s) maps for each sys map plus a constant Parameters ---------- density_correlation: lsstools.DensityCorrelation Returns ------ mean_density_map: Healsparse map Map of the fitted function F(s) where F(s) = 1/weight coeff_output: (sig_map_index, coeff, coeff_cov) sig_map_index: 1D array index of each map considered significant coeff: 1D array best fit parameters coeff=[beta, alpha1, alpha2, etc] coeff_cov: 2D array covariance matrix of coeff from fit """ import scipy.optimize import healsparse as hsp import healpy as hp from . import lsstools s = time.time() # first add the null signal as the first model # null_model = np.ones(len(density_correlation.ndens)) # density_correlation.add_model(null_model, "null") # select only the significant trends sig_map_index = self.select_maps(density_correlation) sysmap_table_all = hsplist2array(self.sys_maps) if len(sig_map_index) == 0: # there are no maps to correct print("No maps selected for correction") # mean_density_map is all ones mean_density_map_bf = hsp.HealSparseMap.make_empty( self.sys_maps[0].nside_coverage, self.sys_maps[0].nside_sparse, dtype=np.float64, sentinel=hp.UNSEEN, ) mean_density_map_bf.update_values_pix( self.sys_maps[0].valid_pixels, np.ones(len(self.sys_maps[0].valid_pixels)).astype(np.float64), ) fit_output = {} fit_output["sig_map_index"] = sig_map_index fit_output["coeff"] = None fit_output["coeff_cov"] = None else: print("{0} map(s) selected for correction".format(len(sig_map_index))) # get the frac det (TO DO: get frac only, dont need deltag) _, frac = self.get_deltag(density_correlation.tomobin) # initial parameters p0 = np.array([1.0] + [0.0] * len(sig_map_index)) # we need to mask the density correlation to only use the selected maps dc_mask = np.in1d(density_correlation.map_index, sig_map_index) dc_covmat_masked = density_correlation.covmat[dc_mask].T[dc_mask].T dc_ndens_masked = density_correlation.ndens[dc_mask] # negative log likelihood to be minimized def neg_log_like(params): beta = params[0] alphas = params[1:] F, Fdc = lsstools.lsstools.linear_model( beta, *alphas, density_correlation=density_correlation, sys_maps=self.sys_maps, sysmap_table=sysmap_table_all, map_index=sig_map_index, frac=frac, ) chi2 = calc_chi2(dc_ndens_masked, dc_covmat_masked, Fdc.ndens) return chi2 / 2.0 minimize_output = scipy.optimize.minimize(neg_log_like, p0, method="Nelder-Mead") coeff = minimize_output.x coeff_cov = None # make best fit model (including the maps that were not selected) # and add this best fit model to the DensityCorrelation instance best_fit = np.array([1.0] + [0.0] * len(self.sys_maps)) best_fit[0] = coeff[0] best_fit[sig_map_index + 1] = coeff[1:] mean_density_map_bf, Fdc_bf = lsstools.lsstools.linear_model( best_fit[0], *best_fit[1:], density_correlation=density_correlation, sys_maps=self.sys_maps, sysmap_table=sysmap_table_all, frac=frac, ) density_correlation.add_model(Fdc_bf.ndens, "linear") # assemble the fitting outputs (chi2, values, coefficents, ) fit_output = {} fit_output["sig_map_index"] = sig_map_index fit_output["coeff"] = coeff fit_output["coeff_cov"] = coeff_cov f = time.time() print("compute_weights took {0}s".format(f - s)) return mean_density_map_bf, fit_output
[docs] class TXLSSWeightsLinPix(TXLSSWeightsLinBinned): """ Compute LSS systematic weights using simultanious linear regression at the pixel level using scikitlearn simple linear regression Model: Linear 1D Covarinace: Shot noise (for now), no correlation between 1d correlations Pixel Covarinace: Shot noise, no correlation between pixels Fit: Simultaniously fits all sysmaps using sklearn """ name = "TXLSSWeightsLinPix" parallel = False config_options = { **TXLSSWeights.config_options, "pvalue_threshold": StageParameter(float, 0.05, msg="max p-value for maps to be corrected"), "regression_class": StageParameter( str, "LinearRegression", msg="sklearn.linear_model class to use in regression" ), } def compute_weights(self, density_correlation): """ Least square fit to a simple linear model at pixel level using p-value of binned 1d trends for regularisation Parameters ---------- density_correlation: lsstools.DensityCorrelation Returns ------ mean_density_map: Healsparse map Map of the fitted function F(s) where F(s) = 1/weight coeff_output: (sig_map_index, coeff, coeff_cov) sig_map_index: 1D array index of each map considered significant coeff: 1D array best fit parameters coeff=[beta, alpha1, alpha2, etc] coeff_cov: 2D array covariance matrix of coeff from fit """ import scipy.optimize import healpy as hp import healsparse as hsp from . import lsstools from sklearn import linear_model as sk_linear_model s = time.time() # first add the null signal as the first model null_model = np.ones(len(density_correlation.ndens)) density_correlation.add_model(null_model, "null") # select only the significant trends sig_map_index = self.select_maps(density_correlation) sysmap_table_all = hsplist2array(self.sys_maps) if len(sig_map_index) == 0: # there are no maps to correct print("0 maps selected for correction") # mean_density_map is all ones mean_density_map_bf = hsp.HealSparseMap.make_empty( self.sys_maps[0].nside_coverage, self.sys_maps[0].nside_sparse, dtype=np.float64, sentinel=hp.UNSEEN, ) mean_density_map_bf.update_values_pix( self.sys_maps[0].valid_pixels, np.ones(len(self.sys_maps[0].valid_pixels)).astype(np.float64), ) fit_output = {} fit_output["sig_map_index"] = sig_map_index fit_output["coeff"] = None fit_output["coeff_cov"] = None else: # there are maps to correct print("{0} map(s) selected for correction".format(len(sig_map_index))) sysmap_table_selected = sysmap_table_all[sig_map_index] # make the delta_g map and fracdet weights deltag, frac = self.get_deltag(density_correlation.tomobin) dg1 = deltag[self.sys_maps[0].valid_pixels] + 1.0 # deltag+1 for valid pixels weight = frac[self.sys_maps[0].valid_pixels] # weight for samples (prop to 1/sigma^2) # do linear regression if self.config["regression_class"].lower() == "linearregression": reg = sk_linear_model.LinearRegression() reg.fit(sysmap_table_selected.T, dg1, sample_weight=weight) elif self.config["regression_class"].lower() == "elasticnetcv": reg = sk_linear_model.ElasticNetCV() reg.fit(sysmap_table_selected.T, dg1, sample_weight=weight) else: raise IOError("regression method {0} not yet implemented".format(self.config["regression_class"])) # get output of regression Fvals = reg.predict(sysmap_table_selected.T) mean_density_map_bf = hsp.HealSparseMap.make_empty( self.sys_maps[0].nside_coverage, self.sys_maps[0].nside_sparse, dtype=np.float64, sentinel=hp.UNSEEN, ) mean_density_map_bf.update_values_pix(self.sys_maps[0].valid_pixels, Fvals.astype(np.float64)) coeff = np.append(reg.intercept_, reg.coef_) coeff_cov = None # make the 1d trends for the regression output Fdc_bf = lsstools.DensityCorrelation() Fdc_bf.set_precompute(density_correlation) for imap, sys_vals in enumerate(sysmap_table_all): edges = density_correlation.get_edges(imap) Fdc_bf.add_correlation( imap, edges, sys_vals, mean_density_map_bf[mean_density_map_bf.valid_pixels], frac=frac[mean_density_map_bf.valid_pixels], map_input=True, use_precompute=True, ) density_correlation.add_model(Fdc_bf.ndens, "linear") # assemble the fitting outputs (chi2, values, coefficents, ) fit_output = {} fit_output["sig_map_index"] = sig_map_index fit_output["coeff"] = coeff fit_output["coeff_cov"] = coeff_cov f = time.time() print("compute_weights took {0}s".format(f - s)) return mean_density_map_bf, fit_output
def hsplist2array(hsp_list): """ Convert a list of healsparse maps to a 2d array of the valid pixels Assume all maps have the same mask """ validpixels = hsp_list[0].valid_pixels out_array = [] for i in range(len(hsp_list)): out_array.append(hsp_list[i][validpixels]) return np.array(out_array)
[docs] class TXLSSWeightsUnit(TXLSSWeights): """ Assign weight=1 to all lens objects """ name = "TXLSSWeightsUnit" parallel = False inputs = [ ("binned_lens_catalog_unweighted", TomographyCatalog), # this file is used by the stage to compute weights ( "lens_tomography_catalog_unweighted", TomographyCatalog, ), # this file is copied at the end and a weighted version is made (for stages that use this instead of the binned catalogs) ("mask", MapsFile), ] outputs = [ ("lss_weight_summary", FileCollection), # output files and summary statistics will go here ("weighted_density_correlation", HDFFile), ("lss_weight_maps", MapsFile), # the systematic weight maps to be applied to the lens galaxies ("binned_lens_catalog", HDFFile), # the lens catalog with weights added ("lens_tomography_catalog", HDFFile), # the tomography file with weights added ] config_options = { "nside_coverage": StageParameter(int, 32, msg="HEALPix nside for coverage maps."), } def prepare_sys_maps(self): """ For unit weights we dont need to load anything """ sys_maps = sys_names = sys_meta = None return sys_maps, sys_names, sys_meta def calculate_1d_density_correlations(self, tomobin, mean_density_map=None): """ For unit weights we dont need 1d density trends """ return None def calc_covariance(self, density_correlation): """ For unit weights we dont need 1d density trends """ return None def load_density_corr(self, ibin): """ For unit weights we dont need 1d density trends """ return None def summarize_weights(self, output_dir, dens_output, density_correlation, weighted_density_correlation, fit_output): """ For unit weights we have nothing to summarize """ pass def compute_weights(self, density_correlation): """ Creates a healsparse map of unit weights Uses the mask directly """ import healpy as hp import healsparse as hsp with self.open_input("mask", wrapper=True) as map_file: mask = map_file.read_map("mask") metadata = map_file.read_map_info("mask") nside = metadata["nside"] nside_coverage = self.config["nside_coverage"] nside_sparse = nside validpixels = mask.valid_pixels mean_density_map = hsp.HealSparseMap.make_empty( nside_coverage, nside_sparse, dtype=np.float64, sentinel=hp.UNSEEN ) mean_density_map.update_values_pix(validpixels, 1.0) fit_output = None return mean_density_map, fit_output