diff --git a/.github/workflows/fireci.yml b/.github/workflows/fireci.yml index 7b5f7109da0..8228fa10728 100644 --- a/.github/workflows/fireci.yml +++ b/.github/workflows/fireci.yml @@ -18,8 +18,10 @@ jobs: - uses: actions/checkout@v3.0.2 - uses: actions/setup-python@v2 with: - python-version: '3.9' + python-version: '3.8' - run: | pip install -e "ci/fireci[test]" - run: | pytest ci/fireci + - run: | + mypy --config-file ci/fireci/setup.cfg ci/fireci/ diff --git a/.gitignore b/.gitignore index 300f5bb4b2f..da3e77d46fe 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ firebase-crashlytics-ndk/.externalNativeBuild/ firebase-crashlytics-ndk/.cxx/ smoke-test-logs/ smoke-tests/build-debug-headGit-smoke-test -smoke-tests/firehorn.log \ No newline at end of file +smoke-tests/firehorn.log +macrobenchmark-output.json diff --git a/ci/fireci/fireci/internal.py b/ci/fireci/fireci/internal.py index c76123e3228..0950d770fc2 100644 --- a/ci/fireci/fireci/internal.py +++ b/ci/fireci/fireci/internal.py @@ -13,7 +13,6 @@ # limitations under the License. import click -import contextlib import functools import glob import itertools @@ -21,6 +20,7 @@ import os import shutil +from contextlib import contextmanager, nullcontext _logger = logging.getLogger('fireci') @@ -30,7 +30,7 @@ def _ensure_dir(directory): os.makedirs(directory) -@contextlib.contextmanager +@contextmanager def _artifact_handler(target_directory, artifact_patterns): _logger.debug( 'Artifacts will be searched for in directories matching {} patterns and placed in {}' @@ -45,7 +45,7 @@ def _artifact_handler(target_directory, artifact_patterns): target_name = os.path.join(target_directory, "_".join(path.split('/'))) _logger.debug('Copying artifact {} to {}'.format(path, target_name)) if os.path.isdir(path): - shutil.copytree(path, target_name) + shutil.copytree(path, target_name, dirs_exist_ok=True) else: shutil.copyfile(path, target_name) @@ -68,8 +68,8 @@ class _CommonOptions: '--artifact-patterns', default=('**/build/test-results', '**/build/reports'), help= - 'Shell-style artifact patterns that are copied into `artifact-target-dir`.'\ - 'Can be specified multiple times.', + 'Shell-style artifact patterns that are copied into `artifact-target-dir`. ' + 'Can be specified multiple times.', multiple=True, type=str, ) @@ -83,30 +83,34 @@ def main(options, **kwargs): setattr(options, k, v) -def ci_command(name=None): +def ci_command(name=None, cls=click.Command, group=main): """Decorator to use for CI commands. The differences from the standard @click.command are: * Allows configuration of artifacts that are uploaded for later viewing in CI. - * Registers the command automatically + * Registers the command automatically. - :param name: Optional name of the task. Defaults to the function name that is decorated with - this decorator. + :param name: Optional name of the task. Defaults to the function name that is decorated with this decorator. + :param cls: Specifies whether the func is a command or a command group. Defaults to `click.Command`. + :param group: Specifies the group the command belongs to. Defaults to the `main` command group. 
""" def ci_command(f): actual_name = f.__name__ if name is None else name - @main.command(name=actual_name, help=f.__doc__) + @click.command(name=actual_name, cls=cls, help=f.__doc__) @_pass_options @click.pass_context def new_func(ctx, options, *args, **kwargs): with _artifact_handler( options.artifact_target_dir, - options.artifact_patterns): + options.artifact_patterns, + ) if cls is click.Command else nullcontext(): return ctx.invoke(f, *args, **kwargs) + group.add_command(new_func) + return functools.update_wrapper(new_func, f) return ci_command diff --git a/ci/fireci/fireci/plugins.py b/ci/fireci/fireci/plugins.py index 66aebd30f8c..715c8de0884 100644 --- a/ci/fireci/fireci/plugins.py +++ b/ci/fireci/fireci/plugins.py @@ -27,7 +27,7 @@ def discover(): Note: plugins *must* define the `firebaseplugins` package as a namespace package. See: https://packaging.python.org/guides/packaging-namespace-packages/ """ - modules = pkgutil.iter_modules(fireciplugins.__path__, - fireciplugins.__name__ + ".") + modules = pkgutil.walk_packages(fireciplugins.__path__, + fireciplugins.__name__ + ".") for _, name, _ in modules: importlib.import_module(name) diff --git a/ci/fireci/fireciplugins/macrobenchmark.py b/ci/fireci/fireciplugins/macrobenchmark.py deleted file mode 100644 index a0fc2f81a39..00000000000 --- a/ci/fireci/fireciplugins/macrobenchmark.py +++ /dev/null @@ -1,319 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio -import glob -import json -import logging -import os -import random -import re -import shutil -import sys -import tempfile -import uuid - -import click -import numpy -import pystache -import yaml -from google.cloud import storage - -from fireci import ci_command -from fireci import ci_utils -from fireci import uploader -from fireci.dir_utils import chdir - -_logger = logging.getLogger('fireci.macrobenchmark') - - -@click.option( - '--build-only/--no-build-only', - default=False, - help='Whether to only build tracing test apps or to also run them on FTL afterwards' -) -@ci_command() -def macrobenchmark(build_only): - """Measures app startup times for Firebase SDKs.""" - asyncio.run(_launch_macrobenchmark_test(build_only)) - - -async def _launch_macrobenchmark_test(build_only): - _logger.info('Starting macrobenchmark test...') - - artifact_versions = await _assemble_all_artifacts() - _logger.info(f'Artifact versions: {artifact_versions}') - - test_dir = await _prepare_test_directory() - _logger.info(f'Directory for test apps: {test_dir}') - - config = await _process_config_yaml() - _logger.info(f'Processed yaml configurations: {config}') - - tests = [MacrobenchmarkTest(app, artifact_versions, os.getcwd(), test_dir) for app in config['test-apps']] - - _logger.info(f'Building {len(tests)} macrobenchmark test apps...') - # TODO(yifany): investigate why it is much slower with asyncio.gather - # - on corp workstations (9 min) than M1 macbook pro (3 min) - # - with gradle 7.5.1 (9 min) than gradle 6.9.2 (5 min) - # await asyncio.gather(*[x.build() for x in tests]) - for test in tests: - await test.build() - - if not build_only: - _logger.info(f'Submitting {len(tests)} tests to Firebase Test Lab...') - results = await asyncio.gather(*[x.test() for x in tests], return_exceptions=True) - await _post_processing(results) - - _logger.info('Macrobenchmark test finished.') - - -async def _assemble_all_artifacts(): - await (await asyncio.create_subprocess_exec('./gradlew', 'assembleAllForSmokeTests')).wait() - - with open('build/m2repository/changed-artifacts.json') as json_file: - artifacts = json.load(json_file) - return dict(_artifact_key_version(x) for x in artifacts['headGit']) - - -def _artifact_key_version(artifact): - group_id, artifact_id, version = artifact.split(':') - return f'{group_id}:{artifact_id}', version - - -async def _process_config_yaml(): - with open('health-metrics/benchmark/config.yaml') as yaml_file: - config = yaml.safe_load(yaml_file) - for app in config['test-apps']: - app['plugins'] = app.get('plugins', []) - app['traces'] = app.get('traces', []) - app['plugins'].extend(config['common-plugins']) - app['traces'].extend(config['common-traces']) - return config - - -async def _prepare_test_directory(): - test_dir = tempfile.mkdtemp(prefix='benchmark-test-') - - # Required for creating gradle wrapper, as the dir is not defined in the root settings.gradle - open(os.path.join(test_dir, 'settings.gradle'), 'w').close() - - command = ['./gradlew', 'wrapper', '--gradle-version', '7.5.1', '--project-dir', test_dir] - await (await asyncio.create_subprocess_exec(*command)).wait() - - return test_dir - - -async def _post_processing(results): - _logger.info(f'Macrobenchmark results: {results}') - - if os.getenv('CI') is None: - _logger.info('Running locally. 
Results upload skipped.') - return - - # Upload successful measurements to the metric service - measurements = [] - for result in results: - if not isinstance(result, Exception): - measurements.extend(result) - - log = ci_utils.ci_log_link() - test_report = {'benchmarks': measurements, 'log': log} - - metrics_service_url = 'https://api.firebase-sdk-health-metrics.com' - access_token = ci_utils.gcloud_identity_token() - uploader.post_report(test_report, metrics_service_url, access_token, 'macrobenchmark') - - # Raise exceptions for failed measurements - if any(map(lambda x: isinstance(x, Exception), results)): - _logger.error(f'Exceptions: {[x for x in results if isinstance(x, Exception)]}') - raise click.ClickException('Macrobenchmark test failed with above errors.') - - -class MacrobenchmarkTest: - """Builds the test based on configurations and runs the test on FTL.""" - def __init__( - self, - test_app_config, - artifact_versions, - repo_root_dir, - test_dir, - logger=_logger - ): - self.test_app_config = test_app_config - self.artifact_versions = artifact_versions - self.repo_root_dir = repo_root_dir - self.test_dir = test_dir - self.logger = MacrobenchmarkLoggerAdapter(logger, test_app_config['sdk']) - self.test_app_dir = os.path.join(test_dir, test_app_config['name']) - self.test_results_bucket = 'fireescape-benchmark-results' - self.test_results_dir = str(uuid.uuid4()) - self.gcs_client = storage.Client() - - async def build(self): - """Creates test app project and assembles app and test apks.""" - await self._create_benchmark_projects() - await self._assemble_benchmark_apks() - - async def test(self): - """Runs benchmark tests on FTL and fetches FTL results from GCS.""" - await self._execute_benchmark_tests() - return await self._aggregate_benchmark_results() - - async def _create_benchmark_projects(self): - app_name = self.test_app_config['name'] - self.logger.info(f'Creating test app "{app_name}"...') - - self.logger.info(f'Copying project template files into "{self.test_app_dir}"...') - template_dir = os.path.join(self.repo_root_dir, 'health-metrics/benchmark/template') - shutil.copytree(template_dir, self.test_app_dir) - - self.logger.info(f'Copying gradle wrapper binary into "{self.test_app_dir}"...') - shutil.copy(os.path.join(self.test_dir, 'gradlew'), self.test_app_dir) - shutil.copy(os.path.join(self.test_dir, 'gradlew.bat'), self.test_app_dir) - shutil.copytree(os.path.join(self.test_dir, 'gradle'), os.path.join(self.test_app_dir, 'gradle')) - - with chdir(self.test_app_dir): - mustache_context = await self._prepare_mustache_context() - renderer = pystache.Renderer() - mustaches = glob.glob('**/*.mustache', recursive=True) - for mustache in mustaches: - self.logger.info(f'Processing template file: {mustache}') - result = renderer.render_path(mustache, mustache_context) - original_name = mustache.removesuffix('.mustache') - with open(original_name, 'w') as file: - file.write(result) - - async def _assemble_benchmark_apks(self): - with chdir(self.test_app_dir): - await self._exec_subprocess('./gradlew', ['assemble']) - - async def _execute_benchmark_tests(self): - app_apk_path = glob.glob(f'{self.test_app_dir}/**/app-benchmark.apk', recursive=True)[0] - test_apk_path = glob.glob(f'{self.test_app_dir}/**/macrobenchmark-benchmark.apk', recursive=True)[0] - - self.logger.info(f'App apk: {app_apk_path}') - self.logger.info(f'Test apk: {test_apk_path}') - - ftl_environment_variables = [ - 'clearPackageData=true', - 'additionalTestOutputDir=/sdcard/Download', - 
'no-isolated-storage=true', - ] - executable = 'gcloud' - args = ['firebase', 'test', 'android', 'run'] - args += ['--type', 'instrumentation'] - args += ['--app', app_apk_path] - args += ['--test', test_apk_path] - args += ['--device', 'model=oriole,version=32,locale=en,orientation=portrait'] - args += ['--directories-to-pull', '/sdcard/Download'] - args += ['--results-bucket', f'gs://{self.test_results_bucket}'] - args += ['--results-dir', self.test_results_dir] - args += ['--environment-variables', ','.join(ftl_environment_variables)] - args += ['--timeout', '30m'] - args += ['--project', 'fireescape-c4819'] - - await self._exec_subprocess(executable, args) - - async def _prepare_mustache_context(self): - mustache_context = { - 'm2repository': os.path.join(self.repo_root_dir, 'build/m2repository'), - 'plugins': self.test_app_config.get('plugins', []), - 'traces': self.test_app_config.get('traces', []), - 'dependencies': [], - } - - if 'dependencies' in self.test_app_config: - for dep in self.test_app_config['dependencies']: - if '@' in dep: - key, version = dep.split('@', 1) - dependency = {'key': key, 'version': version} - else: - dependency = {'key': dep, 'version': self.artifact_versions[dep]} - mustache_context['dependencies'].append(dependency) - - return mustache_context - - async def _aggregate_benchmark_results(self): - results = [] - blobs = self.gcs_client.list_blobs(self.test_results_bucket, prefix=self.test_results_dir) - files = [x for x in blobs if re.search(r'sdcard/Download/[^/]*\.json', x.name)] - for file in files: - device = re.search(r'([^/]*)/artifacts/', file.name).group(1) - benchmarks = json.loads(file.download_as_bytes())['benchmarks'] - for benchmark in benchmarks: - method = benchmark['name'] - clazz = benchmark['className'].split('.')[-1] - runs = benchmark['metrics']['timeToInitialDisplayMs']['runs'] - results.append({ - 'sdk': self.test_app_config['sdk'], - 'device': device, - 'name': f'{clazz}.{method}', - 'min': min(runs), - 'max': max(runs), - 'p50': numpy.percentile(runs, 50), - 'p90': numpy.percentile(runs, 90), - 'p99': numpy.percentile(runs, 99), - 'unit': 'ms', - }) - self.logger.info(f'Benchmark results: {results}') - return results - - async def _exec_subprocess(self, executable, args): - command = " ".join([executable, *args]) - self.logger.info(f'Executing command: "{command}"...') - - proc = await asyncio.subprocess.create_subprocess_exec( - executable, - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE - ) - await asyncio.gather( - self._stream_output(executable, proc.stdout), - self._stream_output(executable, proc.stderr) - ) - - await proc.communicate() - if proc.returncode == 0: - self.logger.info(f'"{command}" finished.') - else: - message = f'"{command}" exited with return code {proc.returncode}.' 
- self.logger.error(message) - raise click.ClickException(message) - - async def _stream_output(self, executable, stream: asyncio.StreamReader): - async for line in stream: - self.logger.info(f'[{executable}] {line.decode("utf-8").strip()}') - - -class MacrobenchmarkLoggerAdapter(logging.LoggerAdapter): - """Decorates log messages for a sdk to make them more distinguishable.""" - - reset_code = '\x1b[m' - - @staticmethod - def random_color_code(): - code = random.randint(16, 231) # https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit - return f'\x1b[38;5;{code}m' - - def __init__(self, logger, sdk_name, color_code=None): - super().__init__(logger, {}) - self.sdk_name = sdk_name - self.color_code = self.random_color_code() if color_code is None else color_code - - def process(self, msg, kwargs): - colored = f'{self.color_code}[{self.sdk_name}]{self.reset_code} {msg}' - uncolored = f'[{self.sdk_name}] {msg}' - return colored if sys.stderr.isatty() else uncolored, kwargs diff --git a/ci/fireci/fireciplugins/macrobenchmark/__init__.py b/ci/fireci/fireciplugins/macrobenchmark/__init__.py new file mode 100644 index 00000000000..6d6d1266c32 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ci/fireci/fireciplugins/macrobenchmark/analyze/__init__.py b/ci/fireci/fireciplugins/macrobenchmark/analyze/__init__.py new file mode 100644 index 00000000000..6d6d1266c32 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/analyze/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ci/fireci/fireciplugins/macrobenchmark/analyze/aggregator.py b/ci/fireci/fireciplugins/macrobenchmark/analyze/aggregator.py new file mode 100644 index 00000000000..5b75e3f2678 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/analyze/aggregator.py @@ -0,0 +1,79 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import pandas as pd +import seaborn as sns + +from pathlib import Path + +logger = logging.getLogger('fireci.macrobenchmark') +sns.set() + + +def calculate_statistic(trace: str, device: str, data: pd.DataFrame, output_dir: Path = None): + logger.info(f'Calculating statistics for trace "{trace}" on device "{device}" ...') + + # Calculate percentiles per each run_id + quantiles = [0.1, 0.25, 0.5, 0.75, 0.9] + percentiles = data.groupby('run_id').quantile(quantiles, numeric_only=True) + percentiles.index.set_names('percentile', level=1, inplace=True) + percentiles = percentiles.reset_index(['run_id', 'percentile']) + percentiles = percentiles.pivot(index='run_id', columns='percentile', values='duration') + + def mapper(quantile: float) -> str: return f'p{int(quantile * 100)}' + + percentiles.rename(mapper=mapper, axis='columns', inplace=True) + + # Calculate dispersions of each percentile over all runs + mean = percentiles.mean() + std = percentiles.std() # standard deviation + cv = std / mean # coefficient of variation (relative standard deviation) + mad = (percentiles - percentiles.mean()).abs().mean() # mean absolute deviation + rmad = mad / mean # relative mean absolute deviation (mad / mean) + dispersions = pd.DataFrame([pd.Series(cv, name='cv'), pd.Series(rmad, name='rmad')]) + + # Optionally save percentiles and dispersions to file + if output_dir: + percentiles.to_json(output_dir.joinpath('percentiles.json'), orient='index') + dispersions.to_json(output_dir.joinpath('dispersions.json'), orient='index') + logger.info(f'Percentiles and dispersions saved in: {output_dir}') + + return percentiles, dispersions + + +def calculate_statistic_diff( + trace: str, + device: str, + control: pd.DataFrame, + experimental: pd.DataFrame, + output_dir: Path = None, +): + logger.info(f'Calculating statistic diff for trace "{trace}" on device "{device}" ...') + + ctl_percentiles, _ = calculate_statistic(trace, device, control) + exp_percentiles, _ = calculate_statistic(trace, device, experimental) + + ctl_mean = ctl_percentiles.mean() + exp_mean = exp_percentiles.mean() + + delta = exp_mean - ctl_mean + percentage = delta / ctl_mean + + # Optionally save statistics to file + if output_dir: + delta.to_json(output_dir.joinpath('delta.json')) + percentage.to_json(output_dir.joinpath('percentage.json')) + logger.info(f'Percentiles diff saved in: {output_dir}') diff --git a/ci/fireci/fireciplugins/macrobenchmark/analyze/analyzer.py b/ci/fireci/fireciplugins/macrobenchmark/analyze/analyzer.py new file mode 100644 index 00000000000..86b8ec4ca76 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/analyze/analyzer.py @@ -0,0 +1,104 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
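
To make the statistics in `aggregator.py` above concrete, here is a toy run on synthetic numbers. The durations below are invented; only the pandas recipe mirrors `calculate_statistic`:

```python
# Toy sanity check: per-run percentiles first, then the spread of each
# percentile across runs (cv = std / mean, rmad = mean abs. deviation / mean).
import numpy as np
import pandas as pd

rng = np.random.default_rng(seed=0)
data = pd.DataFrame({
    'run_id': ['run-0'] * 100 + ['run-1'] * 100,
    'duration': np.concatenate([
        rng.normal(180, 10, 100),   # invented run-0 startup times (ms)
        rng.normal(185, 12, 100),   # invented run-1 startup times (ms)
    ]),
})

# One row per run_id, one column per percentile (p10 ... p90).
percentiles = (
    data.groupby('run_id')['duration']
        .quantile([0.1, 0.25, 0.5, 0.75, 0.9])
        .unstack()
        .rename(columns=lambda q: f'p{int(q * 100)}')
)

# Dispersion of each percentile over all runs, as in calculate_statistic above.
mean = percentiles.mean()
cv = percentiles.std() / mean                     # coefficient of variation
rmad = (percentiles - mean).abs().mean() / mean   # relative mean absolute deviation

print(percentiles.round(1))
print(pd.DataFrame({'cv': cv, 'rmad': rmad}).round(4))
```
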
+ +import logging +import tempfile +import pandas as pd + +from .aggregator import calculate_statistic, calculate_statistic_diff +from .plotter import plot_graph, plot_diff_graph +from .utils import collect_data_points, DataPoint +from click import progressbar +from pathlib import Path +from typing import List + + +logger = logging.getLogger('fireci.macrobenchmark') + + +def start( + diff_mode: bool, + ftl_results_dir: List[str], + local_reports_dir: Path, + ctl_ftl_results_dir: List[str], + ctl_local_reports_dir: Path, + exp_ftl_results_dir: List[str], + exp_local_reports_dir: Path, + output_dir: Path +): + logger.info('Starting to analyze macrobenchmark test results ...') + + if not output_dir: + output_dir = Path(tempfile.mkdtemp(prefix='macrobenchmark-analysis-')) + logger.info(f'Created temporary dir "{output_dir}" to save analysis results') + + if not diff_mode: + data_points = collect_data_points(ftl_results_dir, local_reports_dir) + _process(data_points, output_dir) + else: + logger.info('Running in diff mode ...') + ctl_data_points = collect_data_points(ctl_ftl_results_dir, ctl_local_reports_dir) + exp_data_points = collect_data_points(exp_ftl_results_dir, exp_local_reports_dir) + _diff(ctl_data_points, exp_data_points, output_dir) + + logger.info(f'Completed analysis and saved output in: {output_dir}') + + +def _process(data_points: List[DataPoint], output_dir: Path) -> None: + data = pd.DataFrame(data_points) + traces = sorted(data['trace'].unique()) + devices = sorted(data['device'].unique()) + + trace_device_combinations = [(trace, device) for trace in traces for device in devices] + + with progressbar(trace_device_combinations) as combinations: + for trace, device in combinations: + combination_dir = output_dir.joinpath(trace, device) + combination_dir.mkdir(parents=True, exist_ok=True) + subset = _filter_subset(data, trace, device) + calculate_statistic(trace, device, subset, combination_dir) + plot_graph(trace, device, subset, combination_dir) + + +def _diff( + ctl_data_points: List[DataPoint], + exp_data_points: List[DataPoint], + output_dir: Path +) -> None: + ctl_data = pd.DataFrame(ctl_data_points) + exp_data = pd.DataFrame(exp_data_points) + all_data = pd.concat([ctl_data, exp_data]) + + traces = sorted(all_data['trace'].unique()) + devices = sorted(all_data['device'].unique()) + + trace_device_combinations = [(trace, device) for trace in traces for device in devices] + + with progressbar(trace_device_combinations) as combinations: + for trace, device in combinations: + combination_dir = output_dir.joinpath(trace, device) + combination_dir.mkdir(parents=True, exist_ok=True) + + ctl_subset = _filter_subset(ctl_data, trace, device) + exp_subset = _filter_subset(exp_data, trace, device) + + calculate_statistic_diff(trace, device, ctl_subset, exp_subset, combination_dir) + plot_diff_graph(trace, device, ctl_subset, exp_subset, combination_dir) + + +def _filter_subset(data: pd.DataFrame, trace: str, device: str) -> pd.DataFrame: + return data.loc[ + (data['trace'] == trace) & (data['device'] == device), + ['duration', 'run_id'] + ] diff --git a/ci/fireci/fireciplugins/macrobenchmark/analyze/plotter.py b/ci/fireci/fireciplugins/macrobenchmark/analyze/plotter.py new file mode 100644 index 00000000000..ac73f80815c --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/analyze/plotter.py @@ -0,0 +1,70 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np +import pandas as pd +import seaborn as sns + +from pathlib import Path + + +logger = logging.getLogger('fireci.macrobenchmark') +sns.set() + + +def plot_graph(trace: str, device: str, data: pd.DataFrame, output_dir: Path): + logger.info(f'Plotting graphs for trace "{trace}" on device "{device}" ...') + + unique_run_ids = len(data['run_id'].unique()) + col_wrap = int(np.ceil(np.sqrt(unique_run_ids))) + + histograms = sns.displot(data=data, x='duration', kde=True, col="run_id", col_wrap=col_wrap) + histograms.set_axis_labels(x_var=f'{trace} (ms)') + histograms.set_titles(f'{device} ({{col_var}} = {{col_name}})') + histograms.savefig(output_dir.joinpath('histograms.svg')) + + distributions = sns.displot( + data=data, x='duration', kde=True, height=8, + hue='run_id', palette='muted', multiple='dodge' + ) + distributions.set_axis_labels(x_var=f'{trace} (ms)').set(title=device) + distributions.savefig(output_dir.joinpath('distributions.svg')) + + logger.info(f'Graphs saved in: {output_dir}') + + +def plot_diff_graph( + trace: str, + device: str, + control: pd.DataFrame, + experimental: pd.DataFrame, + output_dir: Path +): + logger.info(f'Plotting distribution diff graph for trace "{trace}" on device "{device}" ...') + + control_run_ids = control['run_id'] + experimental_run_ids = experimental['run_id'] + all_data = pd.concat([control, experimental]) + + palette = {**{x: 'b' for x in control_run_ids}, **{x: 'r' for x in experimental_run_ids}} + + distribution_diff = sns.displot( + data=all_data, x='duration', kde=True, height=8, + hue='run_id', palette=palette, multiple='dodge' + ) + distribution_diff.set_axis_labels(x_var=f'{trace} (ms)').set(title=device) + distribution_diff.savefig(output_dir.joinpath('distribution_diff.svg')) + + logger.info(f'Graph saved in: {output_dir}') diff --git a/ci/fireci/fireciplugins/macrobenchmark/analyze/utils.py b/ci/fireci/fireciplugins/macrobenchmark/analyze/utils.py new file mode 100644 index 00000000000..131bb909a83 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/analyze/utils.py @@ -0,0 +1,82 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import re +import tempfile + +from click import ClickException +from google.cloud import storage +from pathlib import Path +from typing import List, TypedDict + + +logger = logging.getLogger('fireci.macrobenchmark') +DataPoint = TypedDict('DataPoint', {'duration': float, 'device': str, 'trace': str, 'run_id': str}) + + +def collect_data_points(ftl_results_dir: List[str], local_reports_dir: Path) -> List[DataPoint]: + if not ftl_results_dir and not local_reports_dir: + raise ClickException('Neither ftl-results-dir or local-reports-dir is provided.') + elif ftl_results_dir and not local_reports_dir: + temp_dir = _download(ftl_results_dir) + return _extract_raw_data(temp_dir) + elif not ftl_results_dir and local_reports_dir: + return _extract_raw_data(local_reports_dir) + else: + raise ClickException('Should specify either ftl-results-dir or local-reports-dir, not both.') + + +def _download(ftl_results_dirs: List[str]) -> Path: + ftl_results_bucket = 'fireescape-benchmark-results' + gcs = storage.Client() + + temp_dir = tempfile.mkdtemp(prefix='ftl-results-') + for ftl_results_dir in ftl_results_dirs: + blobs = gcs.list_blobs(ftl_results_bucket, prefix=ftl_results_dir) + files = [f for f in blobs if f.name.endswith('.json')] + for file in files: + device = re.search(r'([^/]*)/artifacts/', file.name).group(1) + report_dir = Path(temp_dir).joinpath(ftl_results_dir, device) + report_dir.mkdir(parents=True, exist_ok=True) + filename = file.name.split('/')[-1] + file.download_to_filename(report_dir.joinpath(filename)) + logger.info(f'Downloaded "{file.name}" to "{report_dir}"') + + return Path(temp_dir) + + +def _extract_raw_data(test_reports_dir: Path) -> List[DataPoint]: + data_points: List[DataPoint] = [] + reports = sorted(list(test_reports_dir.rglob("*-benchmarkData.json"))) + for report in reports: + logger.info(f'Processing "{report}" ...') + + run_id = str(report.relative_to(test_reports_dir)).split('/')[0] + with open(report) as file: + obj = json.load(file) + build_context = obj['context']['build'] + device = f'{build_context["device"]}-{build_context["version"]["sdk"]}' + for metric in obj['benchmarks'][0]['metrics'].keys(): + measurements = obj['benchmarks'][0]['metrics'][metric]['runs'] + trace = metric[:-2] # TODO(yifany): .removesuffix('Ms') w/ python 3.9+ + data_points.extend([{ + 'duration': measurement, + 'device': device, + 'trace': trace, + 'run_id': run_id + } for measurement in measurements]) + logger.info(f'Extracted {len(data_points)} data points from reports in "{test_reports_dir}"') + return data_points diff --git a/ci/fireci/fireciplugins/macrobenchmark/commands.py b/ci/fireci/fireciplugins/macrobenchmark/commands.py new file mode 100644 index 00000000000..d431a61b18c --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/commands.py @@ -0,0 +1,128 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
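
For reference, the minimal slice of a `*-benchmarkData.json` report that `_extract_raw_data` above relies on. The keys mirror what the code reads; the values here are invented and real Macrobenchmark output contains many more fields:

```python
# Illustrative only: how the fields of a benchmark report map onto DataPoint
# records (duration, device, trace, run_id).
report = {
    "context": {"build": {"device": "oriole", "version": {"sdk": 32}}},
    "benchmarks": [{
        "name": "startup",
        "className": "com.google.firebase.macrobenchmark.StartupBenchmark",
        "metrics": {
            "timeToInitialDisplayMs": {"runs": [182.4, 179.9, 190.3]},
        },
    }],
}

build = report["context"]["build"]
device = f'{build["device"]}-{build["version"]["sdk"]}'   # -> "oriole-32"

data_points = []
for metric, value in report["benchmarks"][0]["metrics"].items():
    trace = metric[:-2]                                    # strip the "Ms" suffix
    data_points += [{"duration": d, "device": device, "trace": trace, "run_id": "run-0"}
                    for d in value["runs"]]

print(data_points)
```
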
+ +import asyncio +import click + +from .analyze import analyzer +from .run import runner +from fireci import ci_command +from pathlib import Path +from typing import List + + +@ci_command(cls=click.Group) +def macrobenchmark(): + """Macrobenchmark testing command group.""" + pass + + +@click.option( + '--build-only', + is_flag=True, + default=False, + show_default=True, + help='Build the test projects without running the test.' +) +@click.option( + '--local/--remote', + required=True, + help='Run the test on local devices or Firebase Test Lab.' +) +@click.option( + '--repeat', + default=1, + show_default=True, + help='Number of times to repeat the test (for obtaining more data points).' +) +@click.option( + '--output', + type=click.Path(dir_okay=True, resolve_path=True, path_type=Path), + default='macrobenchmark-output.json', + show_default=True, + help='The file for saving macrobenchmark test output if running on Firebase Test Lab.' +) +@ci_command(group=macrobenchmark) +def run(build_only: bool, local: bool, repeat: int, output: Path): + """Run macrobenchmark test.""" + asyncio.run(runner.start(build_only, local, repeat, output)) + + +@click.option( + '--diff-mode', + is_flag=True, + default=False, + help='Compare two sets of macrobenchmark result.' +) +@click.option( + '--ftl-results-dir', + multiple=True, + help='Firebase Test Lab results directory name. Can be specified multiple times.' +) +@click.option( + '--local-reports-dir', + type=click.Path(dir_okay=True, resolve_path=True, path_type=Path), + help='Path to the directory of local test reports.' +) +@click.option( + '--ctl-ftl-results-dir', + multiple=True, + help='FTL results dir of the control group, if running in diff mode. ' + 'Can be specified multiple times.' +) +@click.option( + '--ctl-local-reports-dir', + type=click.Path(dir_okay=True, resolve_path=True, path_type=Path), + help='Path to the local test reports of the control group, if running in diff mode.' +) +@click.option( + '--exp-ftl-results-dir', + multiple=True, + help='FTL results dir of the experimental group, if running in diff mode. ' + 'Can be specified multiple times.' +) +@click.option( + '--exp-local-reports-dir', + type=click.Path(dir_okay=True, resolve_path=True, path_type=Path), + help='Path to the local test reports of the experimental group, if running in diff mode.' +) +@click.option( + '--output-dir', + type=click.Path(dir_okay=True, resolve_path=True, path_type=Path), + help='The directory for saving macrobenchmark analysis result.' +) +@ci_command(group=macrobenchmark) +def analyze( + diff_mode: bool, + ftl_results_dir: List[str], + local_reports_dir: Path, + ctl_ftl_results_dir: List[str], + ctl_local_reports_dir: Path, + exp_ftl_results_dir: List[str], + exp_local_reports_dir: Path, + output_dir: Path +): + """Analyze macrobenchmark result.""" + analyzer.start( + diff_mode, + ftl_results_dir, + local_reports_dir, + ctl_ftl_results_dir, + ctl_local_reports_dir, + exp_ftl_results_dir, + exp_local_reports_dir, + output_dir, + ) + +# TODO(yifany): support of command chaining diff --git a/ci/fireci/fireciplugins/macrobenchmark/run/__init__.py b/ci/fireci/fireciplugins/macrobenchmark/run/__init__.py new file mode 100644 index 00000000000..6d6d1266c32 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/run/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ci/fireci/fireciplugins/macrobenchmark/run/log_decorator.py b/ci/fireci/fireciplugins/macrobenchmark/run/log_decorator.py new file mode 100644 index 00000000000..177f5a1a3ba --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/run/log_decorator.py @@ -0,0 +1,51 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import sys + +from logging import Logger, LoggerAdapter +from typing import Union + + +RESET_CODE = '\x1b[m' + + +class LogDecorator(LoggerAdapter): + """Decorates log messages with colors in console output.""" + + def __init__(self, logger: Union[Logger, LoggerAdapter], key: str): + super().__init__(logger, {}) + self.key = key + self.color_code = self._random_color_code() + + def process(self, msg, kwargs): + colored, uncolored = self._produce_prefix() + result = f'{colored if sys.stderr.isatty() else uncolored} {msg}' + return result, kwargs + + @staticmethod + def _random_color_code(): + code = random.randint(16, 231) # https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit + return f'\x1b[38;5;{code}m' + + def _produce_prefix(self): + if hasattr(super(), '_produce_prefix'): + colored_super, uncolored_super = getattr(super(), '_produce_prefix')() + colored = f'{colored_super} {self.color_code}[{self.key}]{RESET_CODE}' + uncolored = f'{uncolored_super} [{self.key}]' + else: + colored = f'{self.color_code}[{self.key}]{RESET_CODE}' + uncolored = f'[{self.key}]' + return colored, uncolored diff --git a/ci/fireci/fireciplugins/macrobenchmark/run/runner.py b/ci/fireci/fireciplugins/macrobenchmark/run/runner.py new file mode 100644 index 00000000000..af233b8e758 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/run/runner.py @@ -0,0 +1,101 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
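
A small usage sketch for `LogDecorator` above, assuming fireci is installed (`pip install -e "ci/fireci"`). The run-level prefix is applied first and the project-level prefix is prepended as the message passes through the wrapped adapter:

```python
# Sketch only: nest two LogDecorator adapters the way the test runner code in
# this patch does, and observe the combined prefix on stderr.
import logging
import sys

from fireciplugins.macrobenchmark.run.log_decorator import LogDecorator

logging.basicConfig(level=logging.INFO, stream=sys.stderr, format='%(message)s')
base = logging.getLogger('fireci.macrobenchmark')

project_logger = LogDecorator(base, 'all-included')  # one per test project
run_logger = LogDecorator(project_logger, 'run-0')   # one per test run

run_logger.info('Assembling benchmark apks ...')
# Prints roughly: [all-included] [run-0] Assembling benchmark apks ...
# with each bracketed key in a randomly chosen ANSI color when stderr is a TTY.
```
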
+ +import asyncio +import click +import json +import logging +import tempfile +import yaml + +from .test_project_builder import TestProjectBuilder +from .utils import execute +from pathlib import Path +from typing import Dict + + +logger = logging.getLogger('fireci.macrobenchmark') + + +async def start(build_only: bool, local: bool, repeat: int, output: Path): + logger.info('Starting macrobenchmark test ...') + + config = _process_config_yaml() + product_versions = _assemble_all_products() + test_dir = _prepare_test_directory() + template_project_dir = Path('health-metrics/benchmark/template') + + test_projects = [ + TestProjectBuilder( + test_config, + test_dir, + template_project_dir, + product_versions, + ).build() for test_config in config['test-apps']] + + if not build_only: + if local: + for test_project in test_projects: + test_project.run_local(repeat) + else: + remote_runs = [test_project.run_remote(repeat) for test_project in test_projects] + results = await asyncio.gather(*remote_runs, return_exceptions=True) + test_outputs = [x for x in results if not isinstance(x, Exception)] + exceptions = [x for x in results if isinstance(x, Exception)] + + with open(output, 'w') as file: + json.dump(test_outputs, file) + logger.info(f'Output of remote testing saved to: {output}') + + if exceptions: + logger.error(f'Exceptions occurred: {exceptions}') + for test_output in test_outputs: + if test_output['exceptions']: + logger.error(f'Exceptions occurred: {test_output["exceptions"]}') + + if exceptions or any(test_output['exceptions'] for test_output in test_outputs): + raise click.ClickException('Macrobenchmark test failed with above exceptions') + + logger.info(f'Completed macrobenchmark test successfully') + + +def _assemble_all_products() -> Dict[str, str]: + execute('./gradlew', 'assembleAllForSmokeTests', logger=logger) + + product_versions: Dict[str, str] = {} + with open('build/m2repository/changed-artifacts.json') as json_file: + artifacts = json.load(json_file) + for artifact in artifacts['headGit']: + group_id, artifact_id, version = artifact.split(':') + product_versions[f'{group_id}:{artifact_id}'] = version + + logger.info(f'Product versions: {product_versions}') + return product_versions + + +def _process_config_yaml(): + with open('health-metrics/benchmark/config.yaml') as yaml_file: + config = yaml.safe_load(yaml_file) + for app in config['test-apps']: + app['plugins'] = app.get('plugins', []) + app['traces'] = app.get('traces', []) + app['plugins'].extend(config['common-plugins']) + app['traces'].extend(config['common-traces']) + return config + + +def _prepare_test_directory() -> Path: + test_dir = tempfile.mkdtemp(prefix='benchmark-test-') + logger.info(f'Temporary test directory created at: {test_dir}') + return Path(test_dir) diff --git a/ci/fireci/fireciplugins/macrobenchmark/run/test_project.py b/ci/fireci/fireciplugins/macrobenchmark/run/test_project.py new file mode 100644 index 00000000000..9a7bb22befd --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/run/test_project.py @@ -0,0 +1,107 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
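
The remote path in `runner.py` above collects per-run failures instead of aborting the whole batch. A self-contained sketch of that `asyncio.gather(return_exceptions=True)` pattern (`fake_remote_run` is a stand-in, not fireci code):

```python
# Sketch only: launch all remote runs concurrently, let individual failures
# surface as Exception objects, then partition successes from failures.
import asyncio


async def fake_remote_run(run_id: int) -> str:
    await asyncio.sleep(0)                      # stand-in for the FTL invocation
    if run_id == 1:
        raise RuntimeError(f'run-{run_id} failed on FTL')
    return f'2022-11-04_11:18:34_run-{run_id}'  # would be the FTL results dir


async def main() -> None:
    results = await asyncio.gather(*[fake_remote_run(i) for i in range(3)],
                                   return_exceptions=True)
    successes = [r for r in results if not isinstance(r, Exception)]
    failures = [r for r in results if isinstance(r, Exception)]
    print('successes:', successes)
    print('failures :', [str(e) for e in failures])


asyncio.run(main())
```
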
+# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import glob +import re +import shutil + +from .log_decorator import LogDecorator +from .utils import execute, execute_async, generate_test_run_id +from fireci.dir_utils import chdir +from logging import getLogger, Logger, LoggerAdapter +from pathlib import Path +from typing import List, TypedDict, Union + +logger = getLogger('fireci.macrobenchmark') + + +class RemoteTestOutput(TypedDict, total=False): + project: str + successful_runs: List[str] + exceptions: List[str] # Using str due to Exception being not JSON serializable + + +class TestProject: + def __init__(self, name: str, project_dir: Path, custom_logger: Union[Logger, LoggerAdapter]): + self.name = name + self.test_project_dir = project_dir + self.logger = custom_logger + + def run_local(self, repeat: int): + self.logger.info(f'Running test locally for {repeat} times ...') + local_reports_dir = self.test_project_dir.joinpath('_reports') + + with chdir(self.test_project_dir): + for index in range(repeat): + run_id = generate_test_run_id() + run_logger = LogDecorator(self.logger, f'run-{index}') + run_logger.info(f'Run-{index}: {run_id}') + execute('./gradlew', ':macrobenchmark:connectedCheck', logger=run_logger) + + reports = self.test_project_dir.rglob('build/**/*-benchmarkData.json') + run_dir = local_reports_dir.joinpath(run_id) + for report in reports: + device = re.search(r'benchmark/connected/([^/]*)/', str(report)).group(1) + device_dir = run_dir.joinpath(device) + device_dir.mkdir(parents=True, exist_ok=True) + shutil.copy(report, device_dir) + run_logger.debug(f'Copied report file "{report}" to "{device_dir}"') + + self.logger.info(f'Finished all {repeat} runs, local reports dir: "{local_reports_dir}"') + + async def run_remote(self, repeat: int) -> RemoteTestOutput: + self.logger.info(f'Running test remotely for {repeat} times ...') + + with chdir(self.test_project_dir): + await execute_async('./gradlew', 'assemble', logger=self.logger) + app_apk_path = glob.glob('**/app-benchmark.apk', recursive=True)[0] + test_apk_path = glob.glob('**/macrobenchmark-benchmark.apk', recursive=True)[0] + self.logger.info(f'App apk: "{app_apk_path}", Test apk: "{test_apk_path}"') + + async def run(index: int, run_id: str) -> str: + run_logger = LogDecorator(self.logger, f'run-{index}') + run_logger.info(f'Run-{index}: {run_id}') + ftl_environment_variables = [ + 'clearPackageData=true', + 'additionalTestOutputDir=/sdcard/Download', + 'no-isolated-storage=true', + ] + executable = 'gcloud' + args = ['firebase', 'test', 'android', 'run'] + args += ['--type', 'instrumentation'] + args += ['--app', app_apk_path] + args += ['--test', test_apk_path] + args += ['--device', 'model=oriole,version=32,locale=en,orientation=portrait'] + args += ['--directories-to-pull', '/sdcard/Download'] + args += ['--results-bucket', 'fireescape-benchmark-results'] + args += ['--results-dir', run_id] + args += ['--environment-variables', ','.join(ftl_environment_variables)] + args += ['--timeout', '30m'] + args += ['--project', 'fireescape-c4819'] + await execute_async(executable, *args, logger=run_logger) + return run_id + + runs = [run(i, generate_test_run_id()) for i in range(repeat)] + results = await asyncio.gather(*runs, return_exceptions=True) + successes = [x for x in results if not isinstance(x, Exception)] + exceptions = [x for x in results if isinstance(x, Exception)] + + self.logger.info(f'Finished all {repeat} runs, successes: 
{successes}, failures: {exceptions}') + + return RemoteTestOutput( + project=self.name, + successful_runs=successes, + exceptions=[str(e) for e in exceptions] + ) diff --git a/ci/fireci/fireciplugins/macrobenchmark/run/test_project_builder.py b/ci/fireci/fireciplugins/macrobenchmark/run/test_project_builder.py new file mode 100644 index 00000000000..6e6dd6d2a14 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/run/test_project_builder.py @@ -0,0 +1,89 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import pystache +import shutil + +from .log_decorator import LogDecorator +from .test_project import TestProject +from .utils import execute +from pathlib import Path +from typing import Any, Dict + + +logger = logging.getLogger('fireci.macrobenchmark') + + +class TestProjectBuilder: + def __init__( + self, + test_config: Any, + test_dir: Path, + template_project_dir: Path, + product_versions: Dict[str, str] + ): + self.test_config = test_config + self.template_project_dir = template_project_dir + self.product_versions = product_versions + + self.name = test_config['name'] + self.logger = LogDecorator(logger, self.name) + self.project_dir = test_dir.joinpath(self.name) + + def build(self) -> TestProject: + self.logger.info(f'Creating test project "{self.name}" ...') + + self._copy_template_project() + self._flesh_out_mustache_template_files() + self._download_gradle_wrapper() + + self.logger.info(f'Test project "{self.name}" created at "{self.project_dir}"') + return TestProject(self.name, self.project_dir, self.logger) + + def _copy_template_project(self): + shutil.copytree(self.template_project_dir, self.project_dir) + self.logger.debug(f'Copied project template files into "{self.project_dir}"') + + def _download_gradle_wrapper(self): + args = ['wrapper', '--gradle-version', '7.5.1', '--project-dir', str(self.project_dir)] + execute('./gradlew', *args, logger=self.logger) + self.logger.debug(f'Created gradle wrapper in "{self.project_dir}"') + + def _flesh_out_mustache_template_files(self): + mustache_context = { + 'm2repository': os.path.abspath('build/m2repository'), + 'plugins': self.test_config.get('plugins', []), + 'traces': self.test_config.get('traces', []), + 'dependencies': [], + } + + if 'dependencies' in self.test_config: + for dep in self.test_config['dependencies']: + if '@' in dep: + key, version = dep.split('@', 1) + dependency = {'key': key, 'version': version} + else: + dependency = {'key': dep, 'version': self.product_versions[dep]} + mustache_context['dependencies'].append(dependency) + + renderer = pystache.Renderer() + mustaches = self.project_dir.rglob('**/*.mustache') + for mustache in mustaches: + self.logger.debug(f'Processing template file: {mustache}') + result = renderer.render_path(mustache, mustache_context) + original_name = str(mustache)[:-9] # TODO(yifany): .removesuffix('.mustache') w/ python 3.9+ + with open(original_name, 'w') as file: + file.write(result) diff --git 
a/ci/fireci/fireciplugins/macrobenchmark/run/utils.py b/ci/fireci/fireciplugins/macrobenchmark/run/utils.py new file mode 100644 index 00000000000..32e90193438 --- /dev/null +++ b/ci/fireci/fireciplugins/macrobenchmark/run/utils.py @@ -0,0 +1,65 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import string +import random + +from asyncio import create_subprocess_exec +from asyncio.subprocess import PIPE as ASYNC_PIPE, STDOUT as ASYNC_STDOUT +from logging import Logger, LoggerAdapter +from subprocess import Popen, PIPE, STDOUT +from typing import Union + + +def generate_test_run_id() -> str: + now = datetime.datetime.now() + date = now.date() + time = now.time() + name = ''.join(random.choices(string.ascii_letters, k=4)) + return f'{date}_{time}_{name}' + + +def execute(program: str, *args: str, logger: Union[Logger, LoggerAdapter]) -> None: + command = " ".join([program, *args]) + logger.info(f'Executing subprocess: "{command}" ...') + + popen = Popen([program, *args], stdout=PIPE, stderr=STDOUT) + for line in popen.stdout: + logger.info(f'[{program}] {line.decode("utf-8").strip()}') + popen.communicate() + + if popen.returncode == 0: + logger.info(f'"{command}" succeeded') + else: + message = f'"{command}" failed with return code {popen.returncode}' + logger.error(message) + raise RuntimeError(message) + + +async def execute_async(program: str, *args: str, logger: Union[Logger, LoggerAdapter]) -> None: + command = " ".join([program, *args]) + logger.info(f'Executing subprocess: "{command}" ...') + + process = await create_subprocess_exec(program, *args, stdout=ASYNC_PIPE, stderr=ASYNC_STDOUT) + async for line in process.stdout: + logger.info(f'[{program}] {line.decode("utf-8").strip()}') + await process.communicate() + + if process.returncode == 0: + logger.info(f'"{command}" succeeded') + else: + message = f'"{command}" failed with return code {process.returncode}' + logger.error(message) + raise RuntimeError(message) diff --git a/ci/fireci/setup.cfg b/ci/fireci/setup.cfg index 4bc55ca8ea5..1237d1a6af9 100644 --- a/ci/fireci/setup.cfg +++ b/ci/fireci/setup.cfg @@ -5,12 +5,15 @@ version = 0.1 [options] install_requires = protobuf==3.19 - click==7.0 - google-cloud-storage==1.44.0 + click==8.1.3 + google-cloud-storage==2.5.0 + mypy==0.991 numpy==1.23.1 + pandas==1.5.1 PyGithub==1.55 pystache==0.6.0 requests==2.23.0 + seaborn==0.12.1 PyYAML==6.0.0 [options.extras_require] @@ -20,3 +23,18 @@ test = [options.entry_points] console_scripts = fireci = fireci.main:cli + +[mypy] +strict_optional = False +[mypy-google.cloud] +ignore_missing_imports = True +[mypy-pandas] +ignore_missing_imports = True +[mypy-pystache] +ignore_missing_imports = True +[mypy-requests] +ignore_missing_imports = True +[mypy-seaborn] +ignore_missing_imports = True +[mypy-yaml] +ignore_missing_imports = True diff --git a/health-metrics/benchmark/README.md b/health-metrics/benchmark/README.md index 5cffaf1532a..3cd25e9617b 100644 --- a/health-metrics/benchmark/README.md +++ 
b/health-metrics/benchmark/README.md @@ -10,7 +10,7 @@ building a macrobenchmark test app for each of the Firebase Android SDKs. If not all of them are required, comment out irrelevant ones for faster build and test time. -## Run benchmark tests +## Run macrobenchmark tests ### Prerequisite @@ -35,16 +35,27 @@ and test time. [doc](https://cloud.google.com/docs/authentication) for full guidance on authentication. -### Run benchmark tests locally +### Run tests locally -1. Build all test apps by running below command in the root - directory `firebase-android-sdk`: +1. [Connect an Android device to the computer](https://d.android.com/studio/run/device) + +1. Run below command in the repository root directory `firebase-android-sdk`: ```shell - fireci macrobenchmark --build-only + fireci macrobenchmark run --local ``` -1. [Connect an Android device to the computer](https://d.android.com/studio/run/device) + **Note**: specify `--repeat ` to run the test multiple times. Run + `fireci macrobenchmark run --help` to see more details. + +Alternatively, developers can also create test apps with `fireci`, and run the +test from either CLI or Android Studio: + +1. Run below command to build all test apps: + + ```shell + fireci macrobenchmark run --build-only + ``` 1. Locate the temporary test apps directory from the log, for example: @@ -89,23 +100,90 @@ and test time. Alternatively, same set of result files are produced at the same output location as invoking tests from CLI, which can be used for inspection. -### Run benchmark tests on Firebase Test Lab +### Run tests on Firebase Test Lab -Build and run all tests on FTL by running below command in the root -directory `firebase-android-sdk`: +Run below command to build and run all tests on FTL: +```shell +fireci macrobenchmark run --remote ``` -fireci macrobenchmark -``` -Alternatively, it is possible to build all test apps via steps described in -[Running benchmark tests locally](#running-benchmark-tests-locally) -and manually -[run tests on FTL with `gcloud` CLI ](https://firebase.google.com/docs/test-lab/android/command-line#running_your_instrumentation_tests). +**Note**: `--repeat ` is also supported to submit the test to FTL for +`` times. All tests on FTL will run in parallel. + +Alternatively, developers can still build test apps locally, and manually +[run tests on FTL with `gcloud` CLI](https://firebase.google.com/docs/test-lab/android/command-line#running_your_instrumentation_tests). Aggregated benchmark results are displayed in the log. The log also contains links to FTL result pages and result files on Google Cloud Storage. +## Analyze macrobenchmark results + +Besides results from `*-benchmarkData.json` as descriped above, `fireci` +supports more in depth analysis, such as: + +- calculating percentiles and visualizing distributions for one test run +- comparing two sets of results (with stats and graphs) from two different runs + +To see more details, run + +```shell +fireci macrobenchmark analyze --help +``` + +### Example usage + +1. Analyzing local test results + + ```shell + fireci macrobenchmark analyze --local-reports-dir + ``` + + `` is the directory containing the `*-benchmarkData.json` from + the local test runs. + + **Note**: If the test is started: + + - with `fireci macrobenchmark run --local`, `fireci` copies all benchmark + json files into a dir, which can be supplied here. + - manually (CLI or Android Studio), `` shall be the directory + that contains `*-benchmarkData.json` in the gradle build directory. + +1. 
Analyzing remote test results + + ```shell + fireci macrobenchmark analyze --ftl-results-dir --ftl-results-dir ... + ``` + + ``, `` are Firebase Test Lab results directory names, such as + `2022-11-04_11:18:34.039437_OqZn`. + +1. Comparing two sets of result from two different FTL runs + + ```shell + fireci macrobenchmark analyze \ + --diff-mode \ + --ctl-ftl-results-dir \ + --ctl-ftl-results-dir \ + ... + --exp-ftl-results-dir \ + --exp-ftl-results-dir \ + ... + ``` + + `ctl` and `exp` are short for "control group" and "experimental group". + +1. Comparing a local test run against a FTL run + + ```shell + fireci macrobenchmark analyze \ + --diff-mode \ + --ctl-ftl-results-dir \ + --ctl-ftl-results-dir \ + ... + --exp-local-reports-dir + ``` + ## Toolchains - Gradle 7.5.1 diff --git a/health-metrics/benchmark/config.yaml b/health-metrics/benchmark/config.yaml index 8852965302e..6a8bc2a0a27 100644 --- a/health-metrics/benchmark/config.yaml +++ b/health-metrics/benchmark/config.yaml @@ -21,52 +21,41 @@ common-plugins: [com.google.gms.google-services] common-traces: [Firebase, ComponentDiscovery, Runtime] test-apps: - - sdk: firebase-config - name: config - dependencies: [com.google.firebase:firebase-config-ktx] - - sdk: firebase-common - name: common - dependencies: [com.google.firebase:firebase-common] - - sdk: firebase-crashlytics - name: crash - dependencies: [com.google.firebase:firebase-crashlytics-ktx] - plugins: [com.google.firebase.crashlytics] - - sdk: firebase-database - name: database - dependencies: [com.google.firebase:firebase-database-ktx] - - sdk: firebase-dynamic-links - name: fdl - dependencies: [com.google.firebase:firebase-dynamic-links-ktx] - - sdk: firebase-firestore - name: firestore - dependencies: [com.google.firebase:firebase-firestore-ktx] - - sdk: firebase-functions - name: functions - dependencies: [com.google.firebase:firebase-functions-ktx] - # TODO(yifany): disable temporarily due to errors of duplicate class and gradle crash - # - sdk: firebase-inappmessaging-display - # name: fiam - # dependencies: - # - com.google.firebase:firebase-analytics-ktx@18.0.3 - # - com.google.firebase:firebase-inappmessaging-ktx - # - com.google.firebase:firebase-inappmessaging-display-ktx - - sdk: firebase-messaging - name: message - dependencies: [com.google.firebase:firebase-messaging-ktx] - - sdk: firebase-perf - name: perf - dependencies: [com.google.firebase:firebase-perf-ktx] - plugins: [com.google.firebase.firebase-perf] - - sdk: firebase-storage - name: stroage - dependencies: [com.google.firebase:firebase-storage-ktx] - - -# TODO(yifany): google3 sdks, customizing FTL devices -# auth -# analytics -# combined -# - crashlytics + analytics -# - crashlytics + fireperf -# - auth + firestore -# - ... + - sdk: N.A. 
+ name: all-included + dependencies: + - com.google.firebase:firebase-abt + - com.google.firebase:firebase-appcheck + - com.google.firebase:firebase-appdistribution + - com.google.firebase:firebase-crashlytics + - com.google.firebase:firebase-database + - com.google.firebase:firebase-dynamic-links + - com.google.firebase:firebase-firestore + - com.google.firebase:firebase-functions + - com.google.firebase:firebase-inappmessaging + - com.google.firebase:firebase-inappmessaging-display + - com.google.firebase:firebase-messaging + - com.google.firebase:firebase-ml-modeldownloader + - com.google.firebase:firebase-perf + - com.google.firebase:firebase-storage + plugins: + - com.google.firebase.crashlytics + - com.google.firebase.firebase-perf + traces: + - fire-abt + - fire-app-check + - fire-appdistribution + - fire-cls + - fire-dl + - fire-fcm + - fire-fiam + - fire-fiamd + - fire-fn + - fire-fst + - fire-gcs + - fire-installations + - firebase-ml-modeldownloader + - fire-perf + - fire-rc + - fire-rtdb + - fire-transport diff --git a/health-metrics/benchmark/template/macrobenchmark/src/main/java/com/google/firebase/macrobenchmark/BenchmarkTest.kt.mustache b/health-metrics/benchmark/template/macrobenchmark/src/main/java/com/google/firebase/macrobenchmark/BenchmarkTest.kt.mustache index 4fa5af3546e..82dd0ecbf3b 100644 --- a/health-metrics/benchmark/template/macrobenchmark/src/main/java/com/google/firebase/macrobenchmark/BenchmarkTest.kt.mustache +++ b/health-metrics/benchmark/template/macrobenchmark/src/main/java/com/google/firebase/macrobenchmark/BenchmarkTest.kt.mustache @@ -39,7 +39,7 @@ class StartupBenchmark { TraceSectionMetric("{{.}}"), {{/traces}} ), - iterations = 5, + iterations = 100, startupMode = StartupMode.COLD ) { pressHome()
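
Closing note on the command wiring (plugin discovery, `ci_command` groups, `commands.py`): a quick in-process smoke test with click's `CliRunner` confirms the new group and its subcommands are registered, without needing a gradle build or an FTL project. A sketch, assuming `pip install -e "ci/fireci"` has been run:

```python
# Sketch only (not part of the patch): --help short-circuits before any
# callback runs, so no device, gradle build, or GCP credentials are required.
from click.testing import CliRunner

from fireciplugins.macrobenchmark.commands import macrobenchmark

runner = CliRunner()
for args in (['--help'], ['run', '--help'], ['analyze', '--help']):
    result = runner.invoke(macrobenchmark, args)
    assert result.exit_code == 0, result.output

print(runner.invoke(macrobenchmark, ['analyze', '--help']).output)
```
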