使用 Foundry API,如何获取数据集的行数和列数?

问题描述

我希望使用 Foundry 中的 API 检索数据集的记录数和列数。我发现 “.../monocle/api/table/stats” 这个 API 似乎可以返回记录数,但是我不知道如何把数据集的 rid 传给它。

非常感谢任何帮助。我的最终目标是获取我管理的所有数据集的总列数、记录数和大小,以便使用 Quiver 或 Slate 构建仪表板,展示我们在 Foundry 平台内管理的数据量。

解决方法

您可以使用以下示例代码来计算数据集的统计量:

import time
import requests
from urllib.parse import quote_plus
import json

def calculate_dataset_stats(token: str, dataset_rid: str, branch='master', api_base='https://foundry-stack.com',
                            poll_interval: float = 0.5) -> dict:
    """
    Calculates statistics for last transaction of a dataset in a branch.

    Sends a POST to the foundry-stats endpoint, reads the view's end
    transaction rid and schema id from the response, then polls the same
    endpoint with GET until the reported status is 'SUCCEEDED' or 'FAILED'.

    Args:
        token: bearer token sent in the authorization header of every request
        dataset_rid: the dataset rid
        branch: branch of the dataset
        api_base: base URL of the Foundry stack; the API path is appended
            directly, so it should not end with a slash
        poll_interval: seconds to wait between two status polls

    Returns: a dictionary with statistics

    Raises:
        requests.HTTPError: if any request returns an HTTP error status
        ValueError: if the calculation ends with a status other than 'SUCCEEDED'
    """
    stats_url = (f"{api_base}/foundry-stats/api/stats/datasets/"
                 f"{dataset_rid}/branches/{quote_plus(branch)}")
    # Both the start request and every poll must be authenticated, so the
    # same headers (including the bearer token) are shared by all requests.
    headers = {
        'content-type': "application/json",
        'authorization': f"Bearer {token}",
    }

    start_stats_calculation = requests.post(stats_url, headers=headers)
    start_stats_calculation.raise_for_status()
    view = start_stats_calculation.json()['view']
    poll_params = {
        'endTransactionRid': view['endTransactionRid'],
        'schemaId': view['schemaId'],
    }

    while True:
        response = requests.get(stats_url, headers=headers, params=poll_params)
        response.raise_for_status()
        maybe_stats = response.json()
        if maybe_stats['status'] in ('SUCCEEDED', 'FAILED'):
            break
        # Only wait when another poll is actually needed.
        time.sleep(poll_interval)

    if maybe_stats['status'] != 'SUCCEEDED':
        raise ValueError(f'Stats Calculation failed for dataset {dataset_rid}. '
                         f'Failure handling not implemented.')

    return maybe_stats['result']['succeededDatasetResult']['stats']


# Example usage: replace the placeholder values below with a real bearer
# token and the rid of the dataset to inspect.
token = "eyJwb..."
dataset_rid = "ri.foundry.main.dataset.14703427-09ab-4c9c-b036-1234b34d150b"
stats = calculate_dataset_stats(token=token, dataset_rid=dataset_rid)

# Pretty-print the returned statistics dictionary as indented JSON.
print(json.dumps(stats, indent=4))