这是我对FPGrowth算法的实现,作为一种优化,我避免在前缀的每个扩展处重新创建树,同时使用视图表示,我认为这种表示更有内存效率。如果有其他方法来改进当前的代码,请告诉我。如有任何反馈,将不胜感激。
#include <iostream>
#include <map>
#include <unordered_set>
#include <vector>
#include <unordered_map>
#include <deque>
#include <queue>
#import <cassert>
#include <stack>
/**
* Trie node required by the fpgrowth algorithm
* @tparam T
*/
template <typename T> struct fpgrowth_node {
size_t count, // Number of instances being observed
tmp_count, // Temporary count while pruning the graph: instead of creating another fpgrowth tree, I'll use views to accelerate the task
height; // height inducing the visiting order from the leaves
const T* value; // Pointer to the actual value: this is done so to alleviate the memory from having multiple copies of the object. (the actual singleton will reside in the main fpgrowth table)
struct fpgrowth_node* parent; // Root for the current node (if any)
std::unordered_map<const T*, struct fpgrowth_node*> children; // Children mapped by key value
struct fpgrowth_node* list; // single linked list connecting all the nodes associated to the same object
// bool current_leaf; // whether this element is currently a leaf
fpgrowth_node(const T* k = nullptr) : value{k}, count{1}, tmp_count{0}, height{0}, parent{nullptr}, children{}, list{nullptr} {}
/**
* Adding a child to the curent node or, if it already exists, incrementing the counter for the existing one
* @param value Value associated to the child
* @return Either the already-existing child or the newly allocated one
*/
struct fpgrowth_node* add_child(const T* value) {
auto it = children.emplace(value, nullptr);
if (it.second) { // If I am adding a new child
it.first->second = new fpgrowth_node(value); // Allocating the new child
it.first->second->parent = this; // Setting the parent to the current node
it.first->second->height = height+1; // Increasing the height, so to exploit topological sort for efficiently scanning the tree from the leaves
} else {
it.first->second->height = std::max(height+1,it.first->second->height); // Otherwise, update the nodes' depth depending on my current one
it.first->second->count++; // If the child was already there, increment its count
}
return it.first->second; // returning the child already being created in the trie
}
~fpgrowth_node() {
for (const auto& [k,v] : children) delete v; // deleting all of the allocated children so to save memory and avoid memory leaks
}
};
template <typename T>
static inline void fpgrowth_expand(struct fpgrowth_node<T>* tree,
const T* current,
size_t curr_support,
std::unordered_set<const T*> prefix,
const std::unordered_set<struct fpgrowth_node<T>*>& view,
std::unordered_map<const T*, struct fpgrowth_node<T>*>& traverse,
std::vector<std::pair<size_t, std::unordered_set<const T*>>>& results,
size_t minsupport = 1) {
// std::cout << "Step: ";
// for (const auto& ptr :prefix)
// std::cout << *ptr << " ";
// std::cout << *current << std::endl;
std::unordered_set<struct fpgrowth_node<T>*> visitable, // Whether the element was already visited in a previous iteration
in_tree // Whether the node, for its good support, might be part of the next pruning and therefore part of the view
;
auto comp = []( struct fpgrowth_node<T>* a, struct fpgrowth_node<T>* b ) { return a->height < b->height; }; // As a tree is a DAG, I can exploit the topological order to know in which order extract the nodes from the queue
std::priority_queue<struct fpgrowth_node<T>*, std::vector<struct fpgrowth_node<T>*>, decltype(comp)> dq; // Priority queue for storing all the nodes, being sorted by order of visit
auto it = traverse.find(current); // Checking whether we are allowed to traverse from this node
std::unordered_map<const T*, struct fpgrowth_node<T>*> new_firsttraverse; // Updating the first_to_traverse table depending on the nodes being available
std::unordered_map<const T*, size_t> toTraverseNext; // Updating the support table while keeping the support of the previous nodes
std::vector<struct fpgrowth_node<T>*> visitedChild; // For each non-leaf node, keeping all the childs that were visited within the view
if (it != traverse.end()) {
{
struct fpgrowth_node<T>* ptr = it->second;
while (ptr) {
if (view.contains(ptr)) { // If the current element in the linked list is actually part of the view
dq.emplace(ptr);
}
ptr = ptr->list; // Scanning all the elements in the list
}
}
while (!dq.empty()) {
struct fpgrowth_node<T>* dq_ptr = dq.top(); // Next element to be visited
dq.pop();
if (!visitable.emplace(dq_ptr).second) continue; // Discarding visiting the node if visited already
{
dq_ptr->tmp_count = 0; // Re-setting the counter to zero
// dq_ptr->current_leaf = true;
bool isLeaf = (dq_ptr->value == current);
bool goodToTest = false;
if (isLeaf) {
dq_ptr->tmp_count = dq_ptr->count;// As this is the leaf, it needs to hold the tmp_count for the visit its own support count.
// dq_ptr->current_leaf = true;// Setting the current node as leaf, from which start the visit (from the bottom!)
goodToTest = true; // Yes, I can proceed with the visit
} else {
size_t totalChild = 0; // Counting the number of allowed children to be visited within the view as in (A)
visitedChild.clear(); // Clearing the set of the previous children
for (const auto& [k,v] : dq_ptr->children) {
if (view.contains(v)) {
totalChild++; // (A)
if (visitable.contains(v)) {
visitedChild.emplace_back(v); // Yes, this child has been visited in a previous iteration
}
}
}
if (totalChild == visitedChild.size()) { // I can do the counting only if I have already visited all of the childs that are there in the
for (const auto& v : visitedChild) { // After visiting all the childs that I could, *then*, I set my tmp_count to the sum of my child's tmp_count
goodToTest = true;
dq_ptr->tmp_count += v->tmp_count; // Then, setting up the count to the sum of the supports from my fully visited children
// dq_ptr->current_leaf = dq_ptr->current_leaf && ((v->current_leaf) && (v->tmp_count < minsupport)); // Setting myself as a leaf
}
} else
visitable.erase(dq_ptr); // B: If I am not allowed to visit all the children, as they have not fully visited yet, re-set myself in the visiting queue
// the queue guarantees that, if I have not visited another child, then, I will eventually put back in the queue by it in (C), so this does not misses a node
}
if (visitable.contains(dq_ptr)) { // If the visit was successful and (B) did not happen
if (goodToTest && (dq_ptr->tmp_count >= minsupport)) { // testing the support only if I am either a leaf or whether I have already visited all my childs, so to not
auto it4 = toTraverseNext.emplace(dq_ptr->value, dq_ptr->tmp_count); // Setting the current support in the table for the current item
if (!it4.second)
it4.first->second += dq_ptr->tmp_count; // If a previous value was already there, then just update it!
if (current != dq_ptr->value) new_firsttraverse.emplace(dq_ptr->value, dq_ptr); // If this was not met before, then setting the current node as first step in the list to be visited
in_tree.insert(dq_ptr); // Definitively, if the support is good, this should be part of the next iteration
}
if (view.contains(dq_ptr->parent)) // If the parent is an allowed node (still, it should be)
dq.push(dq_ptr->parent); // C: putting the parent as the next step to visit
}
}
}
}
prefix.insert(current); // Adding the current node as part of the prefix
results.emplace_back(curr_support, prefix); // setting this as another result
for (const auto& [next,with_supp] : toTraverseNext) { // Using the updated support table to traverse the remaining nodes
if ((next != current) && ( !prefix.contains(next))) // Visiting next only other strings that were not part of the prefix
fpgrowth_expand(tree, next, with_supp, prefix, in_tree, new_firsttraverse, results, minsupport);
}
}
template <typename T>
static inline std::vector<std::pair<size_t, std::unordered_set<const T*>>> fpgrowth(const std::vector<std::unordered_set<T>>& obj,
std::vector<std::pair<T, size_t>>& final_element_for_scan,
size_t minsupport = 1) {
struct fpgrowth_node<T> tree{nullptr};
final_element_for_scan.clear();
std::unordered_set<struct fpgrowth_node<T>*> in_tree;
std::unordered_map<const T*, struct fpgrowth_node<T>*> last_pointer, firstpointer;
std::unordered_map<const T*, std::unordered_set<struct fpgrowth_node<T>*>> createdElements;
{
std::map<T, size_t> element; // Fast representation for the support table, for efficiently associating the item to the support value
for (auto it = obj.begin(); it != obj.end(); it++) {
for (const T& x : *it) {
auto it2 = element.emplace(x, 1);
if (!it2.second) it2.first->second++;
}
}
// moving this to a vector, while keeping the items with adequate support
for (auto it = element.begin(), en = element.end(); it != en; it++) {
if (it->second >= minsupport)
final_element_for_scan.emplace_back(*it);
}
}
// Sorting the elements in the table by decreasing support
std::sort(final_element_for_scan.begin(), final_element_for_scan.end(), [](const std::pair<T, size_t>& l, const std::pair<T, size_t>& r) {
return l.second>r.second;
});
struct fpgrowth_node<T>* ptr ;
for (auto itx = obj.begin(); itx != obj.end(); itx++) { // For each transaction
auto e = itx->end();
ptr = &tree;
auto it_left = final_element_for_scan.begin();
auto e_left = final_element_for_scan.end();
while ((it_left != e_left)) {
auto f = itx->find(it_left->first);
if (f == e) {it_left++; continue;}
else {
// For each element in the transaction which is also in the support table
auto it3 = last_pointer.emplace(&it_left->first, nullptr);
// Expanding the trie with a new child
ptr = ptr->add_child(&it_left->first);
// Using a set of visited nodes to avoid loops in the list
bool newInsertion = createdElements[&it_left->first].emplace(ptr).second;
in_tree.insert(ptr); // This node shoudl be part of the view
if (it3.second) {
// First pointer of the list for traversing the newly established leaves at each novel iteration efficiently
firstpointer.emplace(&it_left->first, ptr);
it3.first->second = ptr;
} else {
if (!it3.first->second) {
// First pointer of the list for traversing the newly established leaves at each novel iteration efficiently
it3.first->second = ptr;
} else if (newInsertion) {
// Avoiding loops: adding an element in the list only if required
it3.first->second->list = ptr;
it3.first->second = ptr;
}
}
it_left++;
}
}
}
createdElements.clear();
// results of the itemsets with their support to be returned
std::vector<std::pair<size_t, std::unordered_set<const T*>>> results;
// Setting up the new projection of the tree, so to avoid the re-creation of the trees multiple times:
// setting up a view with the allowed nodes will be more efficient (as it will save the extra allocation cost for
// the nodes, and the only memory overhead is just the pointer to the nodes in the FPGrowth tree)
for (auto rit = final_element_for_scan.rbegin(), ren = final_element_for_scan.rend(); rit != ren; rit++) {
fpgrowth_expand(ptr, (const T *) &rit->first, rit->second, std::unordered_set<const T *>{}, in_tree, firstpointer, results,
minsupport);
}
return results;
}
int main() {
std::vector<std::unordered_set<std::string>> S;
S.emplace_back(std::unordered_set<std::string>{"m","o","n","k","e","y"});
S.emplace_back(std::unordered_set<std::string>{"d","o","n","k","e","y"});
S.emplace_back(std::unordered_set<std::string>{"m","a","k","e"});
S.emplace_back(std::unordered_set<std::string>{"m","u","c","k","y"});
S.emplace_back(std::unordered_set<std::string>{"c","o","c","k","i","e"});
std::vector<std::pair<std::string, size_t>> single_support;
for (const auto& ref : fpgrowth(S, single_support, 3)) {
std::cout << "Item: ";
for (const auto& ptr : ref.second)
std::cout << *ptr << " ";
std::cout << " with support " << ref.first << std::endl;
}
return 0;
}
```发布于 2023-05-20 15:33:55
)
启用编译器和DO2警告,并修复所有这些警告。Doyxgen抱怨几个没有文档的类成员,编译器抱怨#import (这应该是#include)、缺少#include <algorithm>和fpgrowth_node构造函数中初始化程序列表的错误顺序。我建议您使用默认成员渐增器来修复后者。
tree中的fpgrowth_expand()编译器没有捕捉到的,因为它看起来使用,是参数tree在fpgrowth_expand()。将其传递给递归调用,但不使用此参数执行任何其他操作。它在那里是无用的,应该移除。
之后重复struct和class
与C语言不同的是,在声明了struct或class之后,您可以只使用它们的名称来引用它们,而不必再用struct或class作为前缀。
尽量避免调用new和delete,而是使用容器或智能指针为您管理内存。它们将确保您不会忘记删除内容,并且在引发异常时也会进行正确的操作。在您的代码中,这很简单:与其将指向fpgrowth_node的指针存储在children中,不如直接存储fpgrowth_nodes:
template <typename T>
struct fpgrowth_node {
…
std::unordered_map<const T*, fpgrowth_node> children;
…
public:
…
fpgrowth_node* add_child(const T* value) {
auto [it, is_new] = children.try_emplace(value, value);
auto& child = it->second;
if (is_new) {
child.parent = this;
child.height = height + 1;
} else {
child.height = std::max(height + 1, child.height);
child.count++;
}
return &child;
}
…
};请注意,您也不再需要这种方式的析构函数。
您已经考虑过通过在item类型上模板化算法来使算法更通用。然而,您仍然需要输入的形式是std::vectors的std::unordered_sets的这些项目。但如果你选择的是std::deque of std::sets呢?您可以使您的类和函数更加通用。考虑:
template <typename Dataset>
auto fpgrowth(const Dataset& dataset, …) {
// Derive the types of transactions and items
using Itemset = Dataset::value_type;
using T = Itemset::value_type;
…
}大部分代码将保持不变。当然,fpgrowth()的第二个参数是一个问题,因为它依赖于T。但是,这实际上是一个out参数,您可以将其作为返回值的一部分,或者将整个final_element_for_scan类型也设置为模板类型。
for循环在一些地方使用range-for循环,但通常使用旧风格的for循环。喜欢在可能的情况下使用范围-for。例如,在fpgrowth()开始时:
{
std::unordered_map<T, size_t> elements;
for (auto& transaction: dataset) {
for (auto& item: transaction) {
elements.emplace(item, 0).first->second++;
}
}
for (auto& [item, count]: elements) {
if (count >= minsupport)
final_element_for_scan.emplace_back(item, count);
}
}
}由于您使用的是C++20,所以我还使用了结构化绑定来避免在第二个for-loop中编写.first和.second。这使我想到:
无论何时使用std::pair,您都要为这两个成员命名-- .first和.second。当你有成对的时候,这些名字就没有什么意义了,你可以写一些像foo.first->second->bar这样的东西,而且在这一点上很难理解代码。
有多种方法可以避免这种情况。首先,您可以使用引用和结构化绑定为一对成员临时提供更好的名称,就像我在add_child()中所做的那样。但是,如果可能的话,更好的做法是不要使用std::pair,只需创建一个具有两个更好名称的成员的struct。考虑:
struct element_count {
T item;
std::size_t count;
};
std::vector<element_count> final_elements_for_scan;和:
struct itemset_support {
Itemset items;
std::size_t support;
};
std::vector<itemset_support> result;与此相关的是,如果您经常使用嵌套类型(如std::unordered_set<fpgrowth_node<T>*> ),则为其创建一个具有明确名称的类型别名,如:
using Nodeset = std::unordered_set<fpgrowth_node<T>*>;
…
Nodeset in_tree;
std::unordered_map<const T*, Nodeset> createdElements;此外,尽量避免缩写过多的名称,并避免使用过于通用的名称。看到it、it2、itx、e、it_left、e_left和it3都在一个函数中不是很好。如前所述,您更喜欢range-for,所以您只需要处理值,而不是迭代器。如果你做了缩写,至少要保持一致;不要在一个地方写element,在另一个地方写e。
在格式化名称的方式上也要保持一致。我在一些地方看到snake_case,在另一些地方看到camelCase。你选择哪种方式并不重要,只要你在任何地方都使用相同的风格。
如果可能的话,考虑使用基于C++20's范围的算法。这避免了您必须指定开始迭代器和结束迭代器,而只需引用整个容器,因此输入的次数较少,出错的可能性也较小。
我看到了std::map和std::unordered_map。我看到成员变量tmp_count在fpgrowth_node中,但是局部函数变量visitable和in_tree。所有这些都有一定的成本,而且算法的性能也会受到很大的影响。例如,std::map中的查找和插入是O(\log N),而std::unordered_map中的查找和插入是O(1)。
还请考虑这些数据结构的内存使用情况。std::map和std::unordered_map都可以为插入的每个项分配单独的堆内存。因此,如果您可以使in_tree成为fpgrowth_node的bool成员变量,这将节省大量内存。
如果可能的话,也考虑合并映射。为什么有单独的last_pointer、firstpointer和createdElements映射,而它们都是由同一个键索引的?这些映射可以通过创建一个包含三个原始映射的值部分的struct进行合并。
https://codereview.stackexchange.com/questions/285070
复制相似问题