feat: add theta sketch (part 1)#45
Conversation
8f1e59f to
38c61a0
Compare
| /// Canonicalize double value for compatibility with Java | ||
| fn canonical_double(value: f64) -> i64 { | ||
| if value == 0.0 { | ||
| 0i64 | ||
| } else if value.is_nan() { | ||
| 0x7ff8000000000000i64 // Java's Double.doubleToLongBits() NaN value | ||
| } else { | ||
| value.to_bits() as i64 | ||
| } | ||
| } |
There was a problem hiding this comment.
We may reduce one branch by leveraging this code snippet - https://github.com/reem/rust-ordered-float/blame/25da208e3e6cca1a1f9b1fcfeaec9e53f6497fa0/src/lib.rs#L2172-L2212
| //! # Serialization | ||
| //! | ||
| //! Sketches can be serialized and deserialized while preserving all state, enabling | ||
| //! cross-platform sketch exchange with other Apache DataSketches implementations | ||
| //! (Java, C++, Python). |
There was a problem hiding this comment.
I tend to remove this section until we implement it.
| //! - **ThetaUnion**: For computing union of multiple theta sketches | ||
| //! - **ThetaIntersection**: For computing intersection of multiple theta sketches |
| } | ||
|
|
||
| /// Get cardinality estimate | ||
| pub fn get_estimate(&self) -> f64 { |
There was a problem hiding this comment.
| pub fn get_estimate(&self) -> f64 { | |
| pub fn estimate(&self) -> f64 { |
Similar to this comment - #44 (comment)
There was a problem hiding this comment.
Can you retry the workflows? Or shall I.
There was a problem hiding this comment.
Nope. The style check should be resolved by running cargo x lint --fix locally. It's a formatting issue where there are extra blank lines.
There was a problem hiding this comment.
Can you retry the workflows? Or shall I.
No need. It's a permanent failure before a new commit applying the formatter.
There was a problem hiding this comment.
No. I think with a write commit I can push a new commit to this PR but I'm yet to have one.
Currently, @ZENOTME would handle these comments and commit for updates and fixes.
| /// | ||
| /// # Panics | ||
| /// | ||
| /// If lg_k is not in range [MIN_LG_K, MAX_LG_K] |
There was a problem hiding this comment.
| /// If lg_k is not in range [MIN_LG_K, MAX_LG_K] | |
| /// If lg_k is not in range [5, 26] |
Make it clear in the docs.
| /// # Panics | ||
| /// | ||
| /// If lg_k is not in range [MIN_LG_K, MAX_LG_K] | ||
| pub fn set_lg_k(mut self, lg_k: u8) -> Self { |
There was a problem hiding this comment.
| pub fn set_lg_k(mut self, lg_k: u8) -> Self { | |
| pub fn lg_k(mut self, lg_k: u8) -> Self { |
I suppose we can also drop the set_ prefix in a builder.
| /// Default seed for hash function | ||
| pub const DEFAULT_SEED: u64 = 9001; |
There was a problem hiding this comment.
Once #48 merged you can leverage the shared DEFAULT_UPDATE_SEED as other impls do.
|
@ZENOTME To be clear, you make some design decisions to mapping:
.. to this Since it's not a 1:1 mapping, may you describe the concept and struct mapping details? I'm not quite familiar with ThetaSketch so I'd count on @leerho on verifying the design correctness. |
In C++ and Java, the class hierarchy of the theta sketch looks like this: In Rust it doesn't have inheritance. So we only have two types: For implementation, in C++, For implementation in Java, it has more than one implementation inheriting from UpdateThetaSketch: DirectQuickSelectSketch, ConcurrentDirectQuickSelectSketch, and HeapAlphaSketch... There are different ways to abstract the implementation: ThetaSketch or wrap it as Box internally. I'm not sure we should consider this now, and is there any good practice for this? @tisonkun |
- add hash table for theta sketch - add ThetaSketch
38c61a0 to
b2bcd76
Compare
|
Is this ready to merge? |
I have fixed the review above, and it's mergeable. cc @tisonkun |
| return false; | ||
| } | ||
|
|
||
| assert!(self.entries[index] == 0, "Entry should be empty"); |
There was a problem hiding this comment.
| assert!(self.entries[index] == 0, "Entry should be empty"); | |
| assert_eq!(self.entries[index], 0, "Entry should be empty"); |
| assert!( | ||
| num_inserted == k as usize, | ||
| "Number of inserted entries should be equal to k." | ||
| ); |
There was a problem hiding this comment.
| assert!( | |
| num_inserted == k as usize, | |
| "Number of inserted entries should be equal to k." | |
| ); | |
| assert_eq!( | |
| num_inserted, k as usize, | |
| "Number of inserted entries should be equal to k." | |
| ); |
| /// Specific hash table for theta sketch | ||
| /// | ||
| /// It maintain a array capacity max to [2 ** (lg_max_size)]: | ||
| /// - Before it reach the max capacity, it will extend the array based on resize_factor. | ||
| /// - After it reach the capacity bigger than 2 ** (lg_nom_size), every time the number of entries | ||
| /// exceeds the threshold, it will rebuild the table: only keep the min 2 ** lg_nom_size entries | ||
| /// and update the theta to the k-th smallest entry. |
There was a problem hiding this comment.
| /// Specific hash table for theta sketch | |
| /// | |
| /// It maintain a array capacity max to [2 ** (lg_max_size)]: | |
| /// - Before it reach the max capacity, it will extend the array based on resize_factor. | |
| /// - After it reach the capacity bigger than 2 ** (lg_nom_size), every time the number of entries | |
| /// exceeds the threshold, it will rebuild the table: only keep the min 2 ** lg_nom_size entries | |
| /// and update the theta to the k-th smallest entry. | |
| /// Specific hash table for theta sketch | |
| /// | |
| /// It maintains an array capacity max to 2^lg_max_size: | |
| /// * Before it reaches the max capacity, it will extend the array based on resize_factor. | |
| /// * After it reaches the capacity bigger than 2^lg_nom_size, every time the number of entries | |
| /// exceeds the threshold, it will rebuild the table: only keep the min 2^lg_nom_size entries | |
| /// and update the theta to the k-th smallest entry. |
|
|
||
| /// Find an entry in the hash table. | ||
| /// | ||
| /// Returns the index of the entry if found, otherwise None. The entry may has been inserted or |
There was a problem hiding this comment.
| /// Returns the index of the entry if found, otherwise None. The entry may has been inserted or | |
| /// Returns the index of the entry if found, otherwise None. The entry may have been inserted or |
|
|
||
| /// Find index in a given entries. | ||
| /// | ||
| /// Returns the index of the entry if found, otherwise None. The entry may has been inserted or |
There was a problem hiding this comment.
| /// Returns the index of the entry if found, otherwise None. The entry may has been inserted or | |
| /// Returns the index of the entry if found, otherwise None. The entry may have been inserted or |
tisonkun
left a comment
There was a problem hiding this comment.
LGTM.
Some style issues can be resolved here or as a follow-up.
resolved |
c44408d to
a726052
Compare
|
@ZENOTME, I saw this comment above:
See the Quick Summary. |
Part 1 of #30:
cc @notfilippo @Xuanwo @PsiACE @tisonkun