LLVM 22.0.0git
SuffixTreeNode.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines nodes for use within a SuffixTree.
10//
11// Each node has either no children or at least two children, with the root
12// being a exception in the empty tree.
13//
14// Children are represented as a map between unsigned integers and nodes. If
15// a node N has a child M on unsigned integer k, then the mapping represented
16// by N is a proper prefix of the mapping represented by M. Note that this,
17// although similar to a trie is somewhat different: each node stores a full
18// substring of the full mapping rather than a single character state.
19//
20// Each internal node contains a pointer to the internal node representing
21// the same string, but with the first character chopped off. This is stored
22// in \p Link. Each leaf node stores the start index of its respective
23// suffix in \p SuffixIdx.
24//===----------------------------------------------------------------------===//
25
26#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28#include "llvm/ADT/DenseMap.h"
30
31namespace llvm {
32
33/// A node in a suffix tree which represents a substring or suffix.
35public:
36 /// Represents an undefined index in the suffix tree.
37 static const unsigned EmptyIdx = -1;
38 enum class NodeKind { ST_Leaf, ST_Internal };
39
40private:
41 const NodeKind Kind;
42
43 /// The start index of this node's substring in the main string.
44 unsigned StartIdx = EmptyIdx;
45
46 /// The length of the string formed by concatenating the edge labels from
47 /// the root to this node.
48 unsigned ConcatLen = 0;
49
50 /// These two indices give a range of indices for its leaf descendants.
51 /// Imagine drawing a tree on paper and assigning a unique index to each leaf
52 /// node in monotonically increasing order from left to right. This way of
53 /// numbering the leaf nodes allows us to associate a continuous range of
54 /// indices with each internal node. For example, if a node has leaf
55 /// descendants with indices i, i+1, ..., j, then its LeftLeafIdx is i and
56 /// its RightLeafIdx is j. These indices are for LeafNodes in the SuffixTree
57 /// class, which is constructed using post-order depth-first traversal.
58 unsigned LeftLeafIdx = EmptyIdx;
59 unsigned RightLeafIdx = EmptyIdx;
60
61public:
62 // LLVM RTTI boilerplate.
63 NodeKind getKind() const { return Kind; }
64
65 /// \return the start index of this node's substring in the entire string.
66 LLVM_ABI unsigned getStartIdx() const;
67
68 /// \returns the end index of this node.
69 virtual unsigned getEndIdx() const = 0;
70
71 /// \return the index of this node's left most leaf node.
72 LLVM_ABI unsigned getLeftLeafIdx() const;
73
74 /// \return the index of this node's right most leaf node.
75 LLVM_ABI unsigned getRightLeafIdx() const;
76
77 /// Set the index of the left most leaf node of this node to \p Idx.
78 LLVM_ABI void setLeftLeafIdx(unsigned Idx);
79
80 /// Set the index of the right most leaf node of this node to \p Idx.
81 LLVM_ABI void setRightLeafIdx(unsigned Idx);
82
83 /// Advance this node's StartIdx by \p Inc.
84 LLVM_ABI void incrementStartIdx(unsigned Inc);
85
86 /// Set the length of the string from the root to this node to \p Len.
87 LLVM_ABI void setConcatLen(unsigned Len);
88
89 /// \returns the length of the string from the root to this node.
90 LLVM_ABI unsigned getConcatLen() const;
91
92 SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
93 : Kind(Kind), StartIdx(StartIdx) {}
94 virtual ~SuffixTreeNode() = default;
95};
96
97// A node with two or more children, or the root.
99private:
100 /// The end index of this node's substring in the main string.
101 ///
102 /// Every leaf node must have its \p EndIdx incremented at the end of every
103 /// step in the construction algorithm. To avoid having to update O(N)
104 /// nodes individually at the end of every step, the end index is stored
105 /// as a pointer.
106 unsigned EndIdx = EmptyIdx;
107
108 /// A pointer to the internal node representing the same sequence with the
109 /// first character chopped off.
110 ///
111 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
112 /// Ukkonen's algorithm does to achieve linear-time construction is
113 /// keep track of which node the next insert should be at. This makes each
114 /// insert O(1), and there are a total of O(N) inserts. The suffix link
115 /// helps with inserting children of internal nodes.
116 ///
117 /// Say we add a child to an internal node with associated mapping S. The
118 /// next insertion must be at the node representing S - its first character.
119 /// This is given by the way that we iteratively build the tree in Ukkonen's
120 /// algorithm. The main idea is to look at the suffixes of each prefix in the
121 /// string, starting with the longest suffix of the prefix, and ending with
122 /// the shortest. Therefore, if we keep pointers between such nodes, we can
123 /// move to the next insertion point in O(1) time. If we don't, then we'd
124 /// have to query from the root, which takes O(N) time. This would make the
125 /// construction algorithm O(N^2) rather than O(N).
126 SuffixTreeInternalNode *Link = nullptr;
127
128public:
129 // LLVM RTTI boilerplate.
130 static bool classof(const SuffixTreeNode *N) {
131 return N->getKind() == NodeKind::ST_Internal;
132 }
133
134 /// \returns true if this node is the root of its owning \p SuffixTree.
135 bool isRoot() const;
136
137 /// \returns the end index of this node's substring in the entire string.
138 unsigned getEndIdx() const override;
139
140 /// Sets \p Link to \p L. Assumes \p L is not null.
141 void setLink(SuffixTreeInternalNode *L);
142
143 /// \returns the pointer to the Link node.
144 SuffixTreeInternalNode *getLink() const;
145
146 /// The children of this node.
147 ///
148 /// A child existing on an unsigned integer implies that from the mapping
149 /// represented by the current node, there is a way to reach another
150 /// mapping by tacking that character on the end of the current string.
152
153 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
155 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
156 Link(Link) {}
157
158 virtual ~SuffixTreeInternalNode() = default;
159};
160
161// A node representing a suffix.
163private:
164 /// The start index of the suffix represented by this leaf.
165 unsigned SuffixIdx = EmptyIdx;
166
167 /// The end index of this node's substring in the main string.
168 ///
169 /// Every leaf node must have its \p EndIdx incremented at the end of every
170 /// step in the construction algorithm. To avoid having to update O(N)
171 /// nodes individually at the end of every step, the end index is stored
172 /// as a pointer.
173 unsigned *EndIdx = nullptr;
174
175public:
176 // LLVM RTTI boilerplate.
177 static bool classof(const SuffixTreeNode *N) {
178 return N->getKind() == NodeKind::ST_Leaf;
179 }
180
181 /// \returns the end index of this node's substring in the entire string.
182 unsigned getEndIdx() const override;
183
184 /// \returns the start index of the suffix represented by this leaf.
185 unsigned getSuffixIdx() const;
186
187 /// Sets the start index of the suffix represented by this leaf to \p Idx.
188 void setSuffixIdx(unsigned Idx);
189 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
190 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
191
192 virtual ~SuffixTreeLeafNode() = default;
193};
194} // namespace llvm
195#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
#define LLVM_ABI
Definition Compiler.h:213
This file defines the DenseMap class.
This is an optimization pass for GlobalISel generic memory operations.
#define N
Determine the kind of a node from its type.
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
virtual ~SuffixTreeInternalNode()=default
static bool classof(const SuffixTreeNode *N)
SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, SuffixTreeInternalNode *Link)
virtual ~SuffixTreeLeafNode()=default
static bool classof(const SuffixTreeNode *N)
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
LLVM_ABI void incrementStartIdx(unsigned Inc)
Advance this node's StartIdx by Inc.
NodeKind getKind() const
LLVM_ABI void setConcatLen(unsigned Len)
Set the length of the string from the root to this node to Len.
LLVM_ABI unsigned getLeftLeafIdx() const
virtual ~SuffixTreeNode()=default
LLVM_ABI unsigned getRightLeafIdx() const
LLVM_ABI void setRightLeafIdx(unsigned Idx)
Set the index of the right most leaf node of this node to Idx.
SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
LLVM_ABI unsigned getConcatLen() const
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
LLVM_ABI unsigned getStartIdx() const
LLVM_ABI void setLeftLeafIdx(unsigned Idx)
Set the index of the left most leaf node of this node to Idx.
virtual unsigned getEndIdx() const =0