datasketches-cpp
compact_theta_sketch_parser_impl.hpp
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #ifndef COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
21 #define COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
22 
23 #include <iostream>
24 #include <iomanip>
25 #include <stdexcept>
26 
27 namespace datasketches {
28 
/**
 * Returns the number of whole bytes needed to store the given number of bits,
 * i.e. the bit count divided by 8 and rounded up.
 * @tparam T integral type of the bit count and of the result
 * @param bits number of bits to store
 * @return number of bytes that can hold that many bits
 */
template<typename T>
T whole_bytes_to_hold_bits(T bits) {
  static_assert(std::is_integral<T>::value, "integral type expected");
  const T full_bytes = bits >> 3;
  // any of the three low bits set means one more, partially used, byte
  const bool has_partial_byte = (bits & 7) != 0;
  return has_partial_byte ? full_bytes + 1 : full_bytes;
}
34 
35 template<bool dummy>
36 auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
37  check_memory_size(ptr, size, 8, dump_on_error);
38  checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
39  uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
40  switch(serial_version) {
41  case 4: {
42  // version 4 sketches are ordered and always have entries (single item in exact mode is v3)
43  const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
44  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
45  const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 1;
46  uint64_t theta = theta_constants::MAX_THETA;
47  if (has_theta) {
48  check_memory_size(ptr, size, 16, dump_on_error);
49  theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_V4_THETA_U64];
50  }
51  const uint8_t num_entries_bytes = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_NUM_ENTRIES_BYTES_BYTE];
52  size_t data_offset_bytes = has_theta ? COMPACT_SKETCH_V4_PACKED_DATA_ESTIMATION_BYTE : COMPACT_SKETCH_V4_PACKED_DATA_EXACT_BYTE;
53  check_memory_size(ptr, size, data_offset_bytes + num_entries_bytes, dump_on_error);
54  uint32_t num_entries = 0;
55  const uint8_t* num_entries_ptr = reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes;
56  for (unsigned i = 0; i < num_entries_bytes; ++i) {
57  num_entries |= (*num_entries_ptr++) << (i << 3);
58  }
59  data_offset_bytes += num_entries_bytes;
60  const uint8_t entry_bits = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_V4_ENTRY_BITS_BYTE];
61  const size_t expected_bits = entry_bits * num_entries;
62  const size_t expected_size_bytes = data_offset_bytes + whole_bytes_to_hold_bits(expected_bits);
63  check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
64  return {false, true, seed_hash, num_entries, theta,
65  reinterpret_cast<const uint8_t*>(ptr) + data_offset_bytes, entry_bits};
66  }
67  case 3: {
68  uint64_t theta = theta_constants::MAX_THETA;
69  const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
70  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
71  return {true, true, seed_hash, 0, theta, nullptr, 64};
72  }
73  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
74  const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
75  if (has_theta) {
76  check_memory_size(ptr, size, (COMPACT_SKETCH_THETA_U64 + 1) * sizeof(uint64_t), dump_on_error);
77  theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
78  }
79  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
80  check_memory_size(ptr, size, 16, dump_on_error);
81  return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64, 64};
82  }
83  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
84  const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
85  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
86  const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
87  check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
88  const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
89  return {false, is_ordered, seed_hash, num_entries, theta, entries, 64};
90  }
91  case 1: {
92  uint16_t seed_hash = compute_seed_hash(seed);
93  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
94  uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
95  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
96  if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
97  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
98  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
99  check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
100  return {false, true, seed_hash, num_entries, theta, entries, 64};
101  }
102  case 2: {
103  uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
104  const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
105  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
106  if (preamble_size == 1) {
107  return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
108  } else if (preamble_size == 2) {
109  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
110  if (num_entries == 0) {
111  return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr, 64};
112  } else {
113  const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
114  check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
115  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
116  return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries, 64};
117  }
118  } else if (preamble_size == 3) {
119  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
120  uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
121  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
122  if (is_empty) return {true, true, seed_hash, 0, theta, nullptr, 64};
123  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
124  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
125  check_memory_size(ptr, size, expected_size_bytes, dump_on_error);
126  return {false, true, seed_hash, num_entries, theta, entries, 64};
127  } else {
128  throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
129  }
130  }
131  default:
132  throw std::invalid_argument("unsupported serial version " + std::to_string(serial_version));
133  }
134 }
135 
136 template<bool dummy>
137 void compact_theta_sketch_parser<dummy>::check_memory_size(const void* ptr, size_t actual_bytes, size_t expected_bytes, bool dump_on_error) {
138  if (actual_bytes < expected_bytes) throw std::out_of_range("at least " + std::to_string(expected_bytes)
139  + " bytes expected, actual " + std::to_string(actual_bytes)
140  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), actual_bytes)) : ""));
141 }
142 
143 template<bool dummy>
144 std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
145  std::stringstream s;
146  s << std::hex << std::setfill('0') << std::uppercase;
147  for (size_t i = 0; i < size; ++i) s << std::setw(2) << (ptr[i] & 0xff);
148  return s.str();
149 }
150 
151 } /* namespace datasketches */
152 
153 #endif
const uint64_t MAX_THETA
max theta - signed max for compatibility with Java
Definition: theta_constants.hpp:36
DataSketches namespace.
Definition: binomial_bounds.hpp:38