libmsnumpress
Numerical compression schemes for proteomics mass spectrometry data
MSNumpress.hpp
Go to the documentation of this file.
1 /*
2  MSNumpress.hpp
3  johan.teleman@immun.lth.se
4 
5  Copyright 2013 Johan Teleman
6 
7  Licensed under the Apache License, Version 2.0 (the "License");
8  you may not use this file except in compliance with the License.
9  You may obtain a copy of the License at
10 
11  http://www.apache.org/licenses/LICENSE-2.0
12 
13  Unless required by applicable law or agreed to in writing, software
14  distributed under the License is distributed on an "AS IS" BASIS,
15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  See the License for the specific language governing permissions and
17  limitations under the License.
18  */
19 /*
20  ==================== encodeInt ====================
21  Some of the encodings described below use an integer compression referred to simply as
22 
23  encodeInt()
24 
25  This encoding works on a 4 byte integer, by truncating initial zeros or ones.
26  If the initial (most significant) half byte is 0x0 or 0xf, the number of such
27  halfbytes starting from the most significant is stored in a halfbyte. This initial
28  count is then followed by the rest of the int's halfbytes, in little-endian order.
29  A count halfbyte c of
30 
31  0 <= c <= 8 is interpreted as an initial c 0x0 halfbytes
32  9 <= c <= 15 is interpreted as an initial (c-8) 0xf halfbytes
33 
34  Ex:
35  int c rest
36  0 => 0x8
37  -1 => 0xf 0xf
38  23 => 0x6 0x7 0x1
39  */
40 
41 #ifndef _MSNUMPRESS_HPP_
42 #define _MSNUMPRESS_HPP_
43 
44 #include <cstddef>
45 #include <vector>
46 
47 // defines whether to throw an exception when a number cannot be encoded safely
48 // with the given parameters
49 #ifndef THROW_ON_OVERFLOW
50 #define THROW_ON_OVERFLOW true
51 #endif
52 
53 namespace ms {
54 namespace numpress {
55 
56 namespace MSNumpress {
57 
58  /**
59  * Compute the maximal linear fixed point that prevents integer overflow.
60  *
61  * @data pointer to array of double to be encoded (need memorycont. repr.)
62  * @dataSize number of doubles from *data to encode
63  *
64  * @return the linear fixed point safe to use
65  */
66  double optimalLinearFixedPoint(
67  const double *data,
68  size_t dataSize);
69 
70  /**
71  * Compute the optimal linear fixed point with a desired m/z accuracy.
72  *
73  * @note If the desired accuracy cannot be reached without overflowing 64
74  * bit integers, then a negative value is returned. You need to check for
75  * this and in that case abandon numpress or use optimalLinearFixedPoint
76  * which returns the largest safe value.
77  *
78  * @data pointer to array of double to be encoded (need memorycont. repr.)
79  * @dataSize number of doubles from *data to encode
80  * @mass_acc desired m/z accuracy in Th
81  *
82  * @return the linear fixed point that satisfies the accuracy requirement (or -1 in case of failure).
83  */
84  double optimalLinearFixedPointMass(
85  const double *data,
86  size_t dataSize,
87  double mass_acc);
88 
89  /**
90  * Encodes the doubles in data by first using a
91  * - lossy conversion to a 4 byte 5 decimal fixed point representation
92  * - storing the residuals from a linear prediction after first two values
93  * - encoding by encodeInt (see above)
94  *
95  * The resulting binary is maximally 8 + dataSize * 5 bytes, but much less if the
96  * data is reasonably smooth on the first order.
97  *
98  * This encoding is suitable for typical m/z or retention time binary arrays.
99  * On a test set, the encoding was empirically shown to be accurate to at least 0.002 ppm.
100  *
101  * @data pointer to array of double to be encoded (need memorycont. repr.)
102  * @dataSize number of doubles from *data to encode
103  * @result pointer to where resulting bytes should be stored
104  * @fixedPoint the scaling factor used for getting the fixed point repr.
105  * This is stored in the binary and automatically extracted
106  * on decoding.
107  * @return the number of encoded bytes
108  */
109  size_t encodeLinear(
110  const double *data,
111  const size_t dataSize,
112  unsigned char *result,
113  double fixedPoint);
114 
115  /**
116  * Calls lower level encodeLinear while handling vector sizes appropriately
117  *
118  * @data vector of doubles to be encoded
119  * @result vector of resulting bytes (will be resized to the number of bytes)
120  */
121  void encodeLinear(
122  const std::vector<double> &data,
123  std::vector<unsigned char> &result,
124  double fixedPoint);
125 
126  /**
127  * Decodes data encoded by encodeLinear.
128  *
129  * result vector guaranteed to be shorter or equal to (|data| - 8) * 2
130  *
131  * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
132  * that the last encoded int does not use the last byte in the data. In addition the last encoded
133  * int need to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
134  *
135  * @data pointer to array of bytes to be decoded (need memorycont. repr.)
136  * @dataSize number of bytes from *data to decode
137  * @result pointer to where resulting doubles should be stored
138  * @return the number of decoded doubles, or -1 if dataSize < 4 or 4 < dataSize < 8
139  */
140  size_t decodeLinear(
141  const unsigned char *data,
142  const size_t dataSize,
143  double *result);
144 
145  /**
146  * Calls lower level decodeLinear while handling vector sizes appropriately
147  *
148  * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
149  * that the last encoded int does not use the last byte in the data. In addition the last encoded
150  * int need to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
151  *
152  * @data vector of bytes to be decoded
153  * @result vector of resulting double (will be resized to the number of doubles)
154  */
155  void decodeLinear(
156  const std::vector<unsigned char> &data,
157  std::vector<double> &result);
158 
159 /////////////////////////////////////////////////////////////
160 
161 
162  /**
163  * Encodes the doubles in data by storing the residuals from a linear prediction after first two values.
164  *
165  * The resulting binary is the same size as the input data.
166  *
167  * This encoding is suitable for typical m/z or retention time binary arrays, and is
168  * intended to be used before zlib compression to improve compression.
169  *
170  * @data pointer to array of doubles to be encoded (need memorycont. repr.)
171  * @dataSize number of doubles from *data to encode
172  * @result pointer to where resulting bytes should be stored
173  */
174  size_t encodeSafe(
175  const double *data,
176  const size_t dataSize,
177  unsigned char *result);
178 
179 
180  /**
181  * Decodes data encoded by encodeSafe.
182  *
183  * result vector is the same size as the input data.
184  *
185  * Might throw const char* if something goes wrong during decoding.
186  *
187  * @data pointer to array of bytes to be decoded (need memorycont. repr.)
188  * @dataSize number of bytes from *data to decode
189  * @result pointer to where resulting doubles should be stored
190  * @return the number of decoded bytes
191  */
192  size_t decodeSafe(
193  const unsigned char *data,
194  const size_t dataSize,
195  double *result);
196 
197 /////////////////////////////////////////////////////////////
198 
199  /**
200  * Encodes ion counts by simply rounding to the nearest 4 byte integer,
201  * and compressing each integer with encodeInt.
202  *
203  * The handleable range is therefore 0 -> 4294967294.
204  * The resulting binary is maximally dataSize * 5 bytes, but much less if the
205  * data is close to 0 on average.
206  *
207  * @data pointer to array of double to be encoded (need memorycont. repr.)
208  * @dataSize number of doubles from *data to encode
209  * @result pointer to where resulting bytes should be stored
210  * @return the number of encoded bytes
211  */
212  size_t encodePic(
213  const double *data,
214  const size_t dataSize,
215  unsigned char *result);
216 
217  /**
218  * Calls lower level encodePic while handling vector sizes appropriately
219  *
220  * @data vector of doubles to be encoded
221  * @result vector of resulting bytes (will be resized to the number of bytes)
222  */
223  void encodePic(
224  const std::vector<double> &data,
225  std::vector<unsigned char> &result);
226 
227  /**
228  * Decodes data encoded by encodePic
229  *
230  * result vector guaranteed to be shorter than or equal to |data| * 2
231  *
232  * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
233  * that the last encoded int does not use the last byte in the data. In addition the last encoded
234  * int need to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
235  *
236  * @data pointer to array of bytes to be decoded (need memorycont. repr.)
237  * @dataSize number of bytes from *data to decode
238  * @result pointer to where resulting doubles should be stored
239  * @return the number of decoded doubles
240  */
241  size_t decodePic(
242  const unsigned char *data,
243  const size_t dataSize,
244  double *result);
245 
246  /**
247  * Calls lower level decodePic while handling vector sizes appropriately
248  *
249  * Note that this method may throw a const char* if it deems the input data to be corrupt, i.e.
250  * that the last encoded int does not use the last byte in the data. In addition the last encoded
251  * int need to use either the last halfbyte, or the second last followed by a 0x0 halfbyte.
252  *
253  * @data vector of bytes to be decoded
254  * @result vector of resulting double (will be resized to the number of doubles)
255  */
256  void decodePic(
257  const std::vector<unsigned char> &data,
258  std::vector<double> &result);
259 
260 /////////////////////////////////////////////////////////////
261 
262 
263  double optimalSlofFixedPoint(
264  const double *data,
265  size_t dataSize);
266 
267  /**
268  * Encodes ion counts by taking the natural logarithm, and storing a
269  * fixed point representation of this. This is calculated as
270  *
271  * unsigned short fp = log(d + 1) * fixedPoint + 0.5
272  *
273  * the result vector is exactly |data| * 2 + 8 bytes long
274  *
275  * @data pointer to array of double to be encoded (need memorycont. repr.)
276  * @dataSize number of doubles from *data to encode
277  * @result pointer to where resulting bytes should be stored
278  * @return the number of encoded bytes
279  */
280  size_t encodeSlof(
281  const double *data,
282  const size_t dataSize,
283  unsigned char *result,
284  double fixedPoint);
285 
286  /**
287  * Calls lower level encodeSlof while handling vector sizes appropriately
288  *
289  * @data vector of doubles to be encoded
290  * @result vector of resulting bytes (will be resized to the number of bytes)
291  */
292  void encodeSlof(
293  const std::vector<double> &data,
294  std::vector<unsigned char> &result,
295  double fixedPoint);
296 
297  /**
298  * Decodes data encoded by encodeSlof
299  *
300  * The return will include exactly (|data| - 8) / 2 doubles.
301  *
302  * Note that this method may throw a const char* if it deems the input data to be corrupt.
303  *
304  * @data pointer to array of bytes to be decoded (need memorycont. repr.)
305  * @dataSize number of bytes from *data to decode
306  * @result pointer to where resulting doubles should be stored
307  * @return the number of decoded doubles
308  */
309  size_t decodeSlof(
310  const unsigned char *data,
311  const size_t dataSize,
312  double *result);
313 
314  /**
315  * Calls lower level decodeSlof while handling vector sizes appropriately
316  *
317  * Note that this method may throw a const char* if it deems the input data to be corrupt.
318  *
319  * @data vector of bytes to be decoded
320  * @result vector of resulting double (will be resized to the number of doubles)
321  */
322  void decodeSlof(
323  const std::vector<unsigned char> &data,
324  std::vector<double> &result);
325 
326 } // namespace MSNumpress
327 } // namespace numpress
328 } // namespace ms
329 
330 #endif // _MSNUMPRESS_HPP_
ms::numpress::MSNumpress::decodeSlof
size_t decodeSlof(const unsigned char *data, const size_t dataSize, double *result)
Definition: MSNumpress.cpp:733
ms::numpress::MSNumpress::optimalSlofFixedPoint
double optimalSlofFixedPoint(const double *data, size_t dataSize)
Definition: MSNumpress.cpp:679
ms::numpress::MSNumpress::optimalLinearFixedPointMass
double optimalLinearFixedPointMass(const double *data, size_t dataSize, double mass_acc)
Definition: MSNumpress.cpp:213
ms::numpress::MSNumpress::encodePic
size_t encodePic(const double *data, size_t dataSize, unsigned char *result)
Definition: MSNumpress.cpp:568
ms::numpress::MSNumpress::decodeSafe
size_t decodeSafe(const unsigned char *data, const size_t dataSize, double *result)
Definition: MSNumpress.cpp:510
ms::numpress::MSNumpress::decodeLinear
size_t decodeLinear(const unsigned char *data, const size_t dataSize, double *result)
Definition: MSNumpress.cpp:361
ms::numpress::MSNumpress::optimalLinearFixedPoint
double optimalLinearFixedPoint(const double *data, size_t dataSize)
Definition: MSNumpress.cpp:234
ms
Definition: MSNumpress.cpp:26
ms::numpress::MSNumpress::encodeLinear
size_t encodeLinear(const double *data, size_t dataSize, unsigned char *result, double fixedPoint)
Definition: MSNumpress.cpp:270
ms::numpress::MSNumpress::encodeSlof
size_t encodeSlof(const double *data, size_t dataSize, unsigned char *result, double fixedPoint)
Definition: MSNumpress.cpp:704
ms::numpress::MSNumpress::decodePic
size_t decodePic(const unsigned char *data, const size_t dataSize, double *result)
Definition: MSNumpress.cpp:617
ms::numpress::MSNumpress::encodeSafe
size_t encodeSafe(const double *data, const size_t dataSize, unsigned char *result)
Definition: MSNumpress.cpp:464