我正在尝试读取一个很大的 JSONL 文件,它可能有几百行到几千行,甚至上百万行。下面是数据样本:
{"id":"gid://shopify/Product/1921569226808"}
{"id":"gid://shopify/ProductVariant/19435458986040","__parentId":"gid://shopify/Product/1921569226808"}
{"id":"gid://shopify/Product/1921569259576"}
{"id":"gid://shopify/ProductVariant/19435459018808","__parentId":"gid://shopify/Product/1921569259576"}
{"id":"gid://shopify/Product/1921569292344"}
{"id":"gid://shopify/ProductVariant/19435459051576","__parentId":"gid://shopify/Product/1921569292344"}
{"id":"gid://shopify/Product/1921569325112"}
{"id":"gid://shopify/ProductVariant/19435459084344","__parentId":"gid://shopify/Product/1921569325112"}
{"id":"gid://shopify/Product/1921569357880"}
{"id":"gid://shopify/ProductVariant/19435459117112","__parentId":"gid://shopify/Product/1921569357880"}
{"id":"gid://shopify/ProductVariant/19435458986123","__parentId":"gid://shopify/Product/1921569226808"}

因此,每一行都是一个 JSON 对象:要么是一个产品,要么是通过 __parentId 标识其所属产品的子对象。考虑到数据可能有成千上万行,读取它并返回如下所示的常规 JSON 对象的最佳方法是什么?
{
"id": "gid://shopify/Product/1921569226808",
"childrens": [
{"id":"gid://shopify/ProductImage/20771195224224","__parentId":"gid://shopify/Product/1921569226808"},
{"id":"gid://shopify/ProductImage/20771195344224","__parentId":"gid://shopify/Product/1921569226808"},
{"id":"gid://shopify/ProductImage/20771329344224","__parentId":"gid://shopify/Product/1921569226808"}
]
}
这些数据是从 Shopify 返回的,他们的建议是:
由于嵌套连接不再嵌套在响应数据结构中,结果中会包含 __parentId 字段,它是对该对象父对象的引用。这个字段在 API 模式(schema)中并不存在,因此无法显式查询它;它会自动包含在批量操作的结果中。请反向读取 JSONL 文件:反向读取更容易对子节点进行分组,并且可以避免遗漏出现在父节点之后的子节点。例如,在收集变体时,当读到这些变体所属的产品时,文件更靠前的位置不会再有其他变体。下载 JSONL 文件后,请反向读取并解析,这样在遇到父节点之前就已经记录了它的所有子节点。
您可以在 Shopify 的批量操作文档中找到更多相关信息。
发布于 2020-12-28 23:53:27
考虑使用流,这样就不必在内存中加载整个文件。
您可以使用readline (一个本机模块)单独处理每一行。
行处理部分取自 @terrymorse 的回答:https://stackoverflow.com/a/65484413/14793527
const readline = require('readline');
const fs = require('fs');
// Parents collected so far, keyed by id; each value is { id, childrens: [...] }.
const res = {};

/**
 * Fold one JSONL record into `res`, grouping child records under their parent.
 *
 * A record without `__parentId` is a parent; a record with `__parentId` is
 * appended to that parent's `childrens` array. Records may arrive in any
 * order: a child seen before its parent creates a placeholder entry, and the
 * parent record arriving later keeps the children already collected.
 *
 * @param {string|object} line - Raw JSONL text (as delivered by readline's
 *   'line' event) or an already-parsed record.
 * @returns {object} The shared `res` map.
 * @throws {SyntaxError} If `line` is a non-blank string that is not valid JSON.
 */
function processLine(line) {
  let record = line;
  if (typeof line === 'string') {
    // Blank lines carry no record; skip them instead of failing in JSON.parse.
    if (line.trim() === '') {
      return res;
    }
    record = JSON.parse(line);
  }

  const { id, __parentId } = record;
  // A record without `__parentId` describes the parent itself.
  const parentId = typeof __parentId === 'undefined' ? id : __parentId;

  // Create the parent entry only once so earlier children are never discarded.
  if (typeof res[parentId] === 'undefined') {
    res[parentId] = {
      id: parentId,
      childrens: []
    };
  }

  if (typeof __parentId !== 'undefined') {
    // add child to parent's children
    res[parentId].childrens.push(record);
  }
  return res;
}
// Stream the file line by line so the whole JSONL never has to sit in memory.
// No `output` is attached: nothing is written back, and wiring process.stdout
// can make readline echo the input on a TTY. The original `console: false`
// is not a readline option; the documented switch is `terminal`.
const readInterface = readline.createInterface({
  input: fs.createReadStream('large.jsonl'),
  // Treat "\r\n" as a single line break regardless of chunk timing.
  crlfDelay: Infinity,
  terminal: false
});
// Each 'line' event hands one raw line of text to processLine.
readInterface.on('line', processLine);
readInterface.on('close', function() {
const resultArray = Object.values(res);
console.log(resultArray);
});发布于 2020-12-28 21:36:12
以下是一种技巧:
(为了简单起见,将输入行转换为数组)
const lines = [
  { "id": "gid://shopify/Product/1921569226808" },
  { "id": "gid://shopify/ProductVariant/19435458986040", "__parentId": "gid://shopify/Product/1921569226808" },
  { "id": "gid://shopify/Product/1921569259576" },
  { "id": "gid://shopify/ProductVariant/19435459018808", "__parentId": "gid://shopify/Product/1921569259576" },
  { "id": "gid://shopify/Product/1921569292344" },
  { "id": "gid://shopify/ProductVariant/19435459051576", "__parentId": "gid://shopify/Product/1921569292344" },
  { "id": "gid://shopify/Product/1921569325112" },
  { "id": "gid://shopify/ProductVariant/19435459084344", "__parentId": "gid://shopify/Product/1921569325112" },
  { "id": "gid://shopify/Product/1921569357880" },
  { "id": "gid://shopify/ProductVariant/19435459117112", "__parentId": "gid://shopify/Product/1921569357880" },
  { "id": "gid://shopify/ProductVariant/19435458986123", "__parentId": "gid://shopify/Product/1921569226808" }
];

/**
 * Group flat records into an object keyed by parent id.
 *
 * A record without `__parentId` is a parent; any other record is appended to
 * the `childrens` array of the parent it names. Order does not matter: a
 * parent that shows up after its children keeps the children already collected.
 *
 * @param {Array<object>} records - Parsed records with `id` and optional `__parentId`.
 * @returns {object} Map of parent id to `{ id, childrens }`.
 */
function groupByParent(records) {
  const grouped = {};

  // Return the entry for a parent id, creating it on first use only.
  const entryFor = (parentId) => {
    if (typeof grouped[parentId] === 'undefined') {
      grouped[parentId] = {
        id: parentId,
        childrens: []
      };
    }
    return grouped[parentId];
  };

  for (const record of records) {
    const { id, __parentId } = record;
    if (typeof __parentId === 'undefined') {
      // if there is no `__parentId`, this is a parent
      entryFor(id);
    } else {
      // this is a child: add it to its parent's children
      entryFor(__parentId).childrens.push(record);
    }
  }
  return grouped;
}

// form object keyed to parent ids
const result = groupByParent(lines);
// convert object to array
const resultArray = Object.values(result);
const pre = document.querySelector('pre');
pre.innerText = 'resultArray: ' + JSON.stringify(resultArray, null, 2);<pre></pre>
https://stackoverflow.com/questions/65484128
复制相似问题