我正在尝试使用 TypeScript 在大型数据集上实现 rank(排名)/dense rank(密集排名)功能。Foundry 的 TypeScript 中是否有现成的库函数,或者有什么简单的方法可以实现此功能?
发布于 2022-07-06 15:45:07
如果您想在 TypeScript 中获得对象的排名(rank)或密集排名(dense rank),可以为某个对象集或某一类型的所有对象实现一个排名函数,如下所示:
import { Function, FunctionsMap, Integer, OntologyObject } from "@foundry/functions-api";
import { Objects, ExampleDataFlight, ObjectSet } from "@foundry/ontology-api";
/**
 * Foundry Functions that rank `ExampleDataFlight` objects using the
 * `compareFlight` ordering and the `rank` helper in its default mode.
 */
export class MyFunctions {
    /**
     * Ranks every flight contained in the supplied object set.
     *
     * @param flightSet - Object set whose flights are loaded and ranked.
     * @returns A map from each loaded flight to its rank.
     */
    @Function()
    public async rankSetOfFlights(flightSet: ObjectSet<ExampleDataFlight>): Promise<FunctionsMap<ExampleDataFlight, Integer>> {
        return rank(await flightSet.allAsync(), compareFlight);
    }

    /**
     * Ranks all flights returned by an unfiltered search over `ExampleDataFlight`.
     *
     * @returns A map from each loaded flight to its rank.
     */
    @Function()
    public async rankAllFlights(): Promise<FunctionsMap<ExampleDataFlight, Integer>> {
        const everyFlight = Objects.search().exampleDataFlight();
        return rank(await everyFlight.allAsync(), compareFlight);
    }
}
/**
 * Comparison function for flights, ordering them by ascending `date`.
 * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort
 *
 * Flights without a date are keyed as `Infinity`, so they sort after every dated flight.
 *
 * @param a - First flight.
 * @param b - Second flight.
 * @returns A negative number if `a` sorts first, a positive number if `b` sorts first, 0 if they tie.
 */
const compareFlight = (a: ExampleDataFlight, b: ExampleDataFlight): number => {
    const aKey = (a.date ?? Infinity).valueOf();
    const bKey = (b.date ?? Infinity).valueOf();
    // Equal keys must yield 0 explicitly: when both dates are missing,
    // `Infinity - Infinity` would otherwise produce NaN.
    if (aKey === bKey) return 0;
    return aKey - bKey;
};
/**
 * Creates a FunctionsMap from an object to its (sparse) rank or dense rank, for a given comparison function.
 *
 * Objects that compare as equal share a rank. In 'sparse' mode the next distinct
 * object receives its 1-based position in the sorted order (ranks are skipped after ties);
 * in 'dense' mode it receives the previous rank plus one (no gaps).
 *
 * The input array is not modified; a sorted copy is used internally.
 *
 * Example call 1:
 * rank(
 *     [{ value: 10 }, { value: 15 }, { value: 15 }, { value: 20 }],
 *     (a, b) => a.value - b.value,
 *     'sparse',
 * )
 *
 * Example output 1:
 * Map<[
 *     { value: 10 } -> 1,
 *     { value: 15 } -> 2,
 *     { value: 15 } -> 2,
 *     { value: 20 } -> 4,
 * ]>
 *
 * Example call 2:
 * rank(
 *     [{ value: 10 }, { value: 15 }, { value: 15 }, { value: 20 }],
 *     (a, b) => a.value - b.value,
 *     'dense',
 * )
 *
 * Example output 2:
 * Map<[
 *     { value: 10 } -> 1,
 *     { value: 15 } -> 2,
 *     { value: 15 } -> 2,
 *     { value: 20 } -> 3,
 * ]>
 *
 * @param objs - Objects to rank; may be empty, in which case an empty map is returned.
 * @param compareFn - Comparison function with `Array.prototype.sort` semantics.
 * @param how - 'sparse' (default) or 'dense' ranking.
 * @returns A map from each object to its 1-based rank.
 */
const rank = <T extends OntologyObject>(objs: T[], compareFn: (a: T, b: T) => number, how: 'sparse' | 'dense' = 'sparse'): FunctionsMap<T, Integer> => {
    const map = new FunctionsMap<T, Integer>();
    if (objs.length === 0) return map;

    // Sort a copy so the caller's array is left untouched
    const sortedObjs = [...objs].sort(compareFn);

    // Iterate through the sorted objects, keeping track of the current rank
    let currentRank = 1;
    sortedObjs.forEach((obj, i) => {
        // Increase the rank when the current object is greater than the last one
        if (i >= 1 && compareFn(obj, sortedObjs[i - 1]) > 0) {
            // Sparse rank is the 1-based position (i is 0-based); dense rank just increments
            currentRank = how === 'sparse' ? i + 1 : currentRank + 1;
        }
        // Set the rank for the object in the map
        map.set(obj, currentRank);
    });
    return map;
};
这对于较小的数据集可能效果很好;目前在大多数情况下,Foundry 会将其限制为最多在 100000 个对象上运行。您可以先过滤对象集(例如,在 Quiver 或 Workshop 中),然后再将其传递给函数,以帮助缓解这个问题。
您在问题中提到,这是针对大量数据的。对于较大的数据集,最好在转换(transform)中使用 Spark 内置的 rank 和 dense_rank 函数,例如在代码仓库(Code Repositories)中。下面这样的转换可能会有所帮助:
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
from transforms.api import transform_df, Input, Output
@transform_df(
Output("/path/to/flights_ranked"),
source_df=Input("/path/to/flights"),
)
def compute(source_df):
return (
source_df
# (you can also use .partitionBy() on the window definition)
.withColumn("rank", F.rank().over(W.orderBy("date")))
.withColumn("dense_rank", F.dense_rank().over(W.orderBy("date")))
)https://stackoverflow.com/questions/72216096
复制相似问题