htsvcf_core/
header.rs

1//! VCF/BCF header representation and manipulation.
2//!
3//! This module provides the [`Header`] struct, which wraps an htslib `bcf_hdr_t`
4//! pointer and provides safe access to header metadata including sample names,
5//! INFO/FORMAT field definitions, and header modification.
6//!
7//! # Ownership
8//!
9//! [`Header`] owns its underlying `bcf_hdr_t*` via duplication. This ensures
10//! correct lifetime management and thread safety - the header can outlive the
11//! reader it was created from.
12//!
13//! # Caching
14//!
15//! Sample names and tag ID-to-name mappings are cached at construction time
16//! for O(1) lookups during variant processing.
17//!
18//! # Example
19//!
20//! ```no_run
21//! use htsvcf_core::header::Header;
22//! use rust_htslib::bcf::{self, Read};
23//!
24//! let reader = bcf::Reader::from_path("input.vcf.gz").unwrap();
25//! let header = unsafe { Header::new(reader.header().inner) };
26//!
27//! // Access sample information
28//! println!("Samples: {:?}", header.sample_names());
29//!
30//! // Look up INFO field type
31//! if let Some((ty, len)) = header.info_type(b"DP") {
32//!     println!("DP is {:?} with length {:?}", ty, len);
33//! }
34//! ```
35
36use rust_htslib::bcf;
37use rust_htslib::bcf::header::{HeaderRecord, TagLength, TagType};
38use std::collections::HashMap;
39use std::ffi::CString;
40use std::mem::ManuallyDrop;
41use std::sync::atomic::{AtomicBool, Ordering};
42use std::sync::{Arc, OnceLock};
43
44impl Drop for Header {
45    fn drop(&mut self) {
46        unsafe {
47            rust_htslib::htslib::bcf_hdr_destroy(self.inner);
48        }
49    }
50}
51
52/// A VCF/BCF header with metadata and sample information.
53///
54/// `Header` owns its underlying `bcf_hdr_t*` via duplication. It provides
55/// access to sample names, INFO/FORMAT field definitions, and supports
56/// header modification (adding INFO/FORMAT fields).
57///
58/// # Caching
59///
60/// Sample names and tag ID-to-name mappings are cached at construction time
61/// for O(1) lookups during variant processing.
62#[derive(Debug)]
63pub struct Header {
64    inner: *mut rust_htslib::htslib::bcf_hdr_t,
65    dirty: AtomicBool,
66    /// Cached sample names in header order.
67    sample_names: Vec<String>,
68    /// Cached map from sample name to index for O(1) lookup.
69    sample_name_to_idx: HashMap<String, usize>,
70    /// Cached map from tag ID to (name_string, name_bytes) for O(1) lookup.
71    /// This covers both INFO and FORMAT tags since they share the ID namespace.
72    id_to_name_cache: HashMap<u32, (String, Vec<u8>)>,
73
74    // Non-owning, process-lifetime HeaderView used for record translation.
75    //
76    // We cannot hand a record an `Arc<HeaderView>` that will run `bcf_hdr_destroy`
77    // on our `Header`'s `inner` pointer. To avoid duplicating the header for
78    // translation while also preventing a double-free, we create an `Arc<HeaderView>`
79    // whose `Drop` never runs by leaking it.
80    translate_view: OnceLock<Arc<rust_htslib::bcf::header::HeaderView>>,
81}
82
/// One field definition taken from the VCF header (INFO, FORMAT, or FILTER).
///
/// All members are plain strings so the value can be handed to callers
/// without exposing htslib types.
#[derive(Clone, Debug)]
pub struct HeaderField {
    /// Field identifier, e.g. `"DP"` or `"GQ"`.
    pub id: String,
    /// Value type: `"Integer"`, `"Float"`, `"String"`, or `"Flag"`.
    pub r#type: String,
    /// The `Number` attribute: a count such as `"1"`, or `"A"`, `"R"`, `"G"`, `"."`.
    pub number: String,
    /// Description text from the header line.
    pub description: String,
}
95
96unsafe impl Send for Header {}
97unsafe impl Sync for Header {}
98
99impl Header {
100    /// Duplicate the underlying header and take ownership.
101    ///
102    /// This is important for thread safety and correct lifetime management: the
103    /// returned `Header` owns its internal `bcf_hdr_t*` and frees it on drop.
104    /// # Safety
105    ///
106    /// `inner` must be a valid pointer to a `bcf_hdr_t`.
107    pub unsafe fn new(inner: *mut rust_htslib::htslib::bcf_hdr_t) -> Self {
108        let inner = rust_htslib::htslib::bcf_hdr_dup(inner);
109        let view = ManuallyDrop::new(bcf::header::HeaderView::new(inner));
110        let sample_count = view.sample_count();
111        let (sample_names, sample_name_to_idx) = if sample_count > 0 {
112            let names: Vec<String> = view
113                .samples()
114                .iter()
115                .map(|s| String::from_utf8_lossy(s).into_owned())
116                .collect();
117            let name_to_idx: HashMap<String, usize> = names
118                .iter()
119                .enumerate()
120                .map(|(i, name)| (name.clone(), i))
121                .collect();
122            (names, name_to_idx)
123        } else {
124            (Vec::new(), HashMap::new())
125        };
126
127        // Build id_to_name cache from INFO and FORMAT records
128        let mut id_to_name_cache: HashMap<u32, (String, Vec<u8>)> = HashMap::new();
129        for record in view.header_records() {
130            let tag_id = match &record {
131                HeaderRecord::Info { values, .. } | HeaderRecord::Format { values, .. } => values
132                    .iter()
133                    .find(|(k, _)| k.as_str() == "ID")
134                    .map(|(_, v)| v.as_str()),
135                _ => None,
136            };
137            if let Some(tag_name) = tag_id {
138                let tag_bytes = tag_name.as_bytes();
139                if let Ok(id) = view.name_to_id(tag_bytes) {
140                    let name = tag_name.to_string();
141                    let bytes = tag_bytes.to_vec();
142                    id_to_name_cache.insert(id.0, (name, bytes));
143                }
144            }
145        }
146
147        Self {
148            inner,
149            dirty: AtomicBool::new(false),
150            sample_names,
151            sample_name_to_idx,
152            id_to_name_cache,
153            translate_view: OnceLock::new(),
154        }
155    }
156
157    /// Create an empty header (for writing new VCFs).
158    pub fn empty() -> Self {
159        let c_str = CString::new(&b"w"[..]).unwrap();
160        let inner = unsafe { rust_htslib::htslib::bcf_hdr_init(c_str.as_ptr()) };
161        Self {
162            inner,
163            dirty: AtomicBool::new(false),
164            sample_names: Vec::new(),
165            sample_name_to_idx: HashMap::new(),
166            id_to_name_cache: HashMap::new(),
167            translate_view: OnceLock::new(),
168        }
169    }
170
171    /// Get the raw `bcf_hdr_t` pointer.
172    ///
173    /// # Safety
174    ///
175    /// The returned pointer is valid for the lifetime of this `Header`.
176    pub fn inner_ptr(&self) -> *mut rust_htslib::htslib::bcf_hdr_t {
177        self.inner
178    }
179
180    pub fn translate_view(&self) -> Arc<rust_htslib::bcf::header::HeaderView> {
181        // Create a HeaderView that points at `self.inner` but never gets dropped.
182        //
183        // `bcf::Record` stores an `Arc<HeaderView>`. `HeaderView::Drop` destroys the
184        // underlying `bcf_hdr_t*`, which would double-free because `Header` also owns
185        // and destroys `self.inner`. To avoid duplicating the header for translation,
186        // we intentionally leak one strong ref to the `HeaderView` so its `Drop` never
187        // runs (the strong count never reaches zero).
188        self.translate_view
189            .get_or_init(|| {
190                let view = Arc::new(rust_htslib::bcf::header::HeaderView::new(self.inner));
191                std::mem::forget(Arc::clone(&view)); // Leak one ref to prevent drop
192                view
193            })
194            .clone()
195    }
196
197    /// Get a temporary header view (internal use).
198    fn view(&self) -> ManuallyDrop<bcf::header::HeaderView> {
199        ManuallyDrop::new(bcf::header::HeaderView::new(self.inner))
200    }
201
202    /// Get all header records (INFO, FORMAT, FILTER, contig, etc.).
203    pub fn header_records(&self) -> Vec<HeaderRecord> {
204        self.view().header_records()
205    }
206
207    /// Get the sample index for a sample name.
208    ///
209    /// Returns `None` if the sample is not found.
210    pub fn sample_id(&self, sample: &[u8]) -> Option<usize> {
211        match self.view().sample_to_id(sample) {
212            Ok(id) => Some(id.0 as usize),
213            Err(_) => None,
214        }
215    }
216
217    /// Get the tag name for a numeric ID (INFO/FORMAT).
218    ///
219    /// This performs a fresh lookup; prefer [`id_to_name_cached`](Self::id_to_name_cached)
220    /// for repeated calls.
221    pub fn id_to_name(&self, id: u32) -> Vec<u8> {
222        self.view().id_to_name(bcf::header::Id(id))
223    }
224
225    /// Get the cached name for a tag ID, returning both String and bytes.
226    ///
227    /// Falls back to [`id_to_name`](Self::id_to_name) if not in cache
228    /// (e.g., for dynamically added tags).
229    pub fn id_to_name_cached(&self, id: u32) -> (String, Vec<u8>) {
230        if let Some(cached) = self.id_to_name_cache.get(&id) {
231            return cached.clone();
232        }
233        // Fallback for tags added after construction
234        let bytes = self.view().id_to_name(bcf::header::Id(id));
235        let name = String::from_utf8_lossy(&bytes).into_owned();
236        (name, bytes)
237    }
238
239    /// Get the number of samples in the VCF.
240    pub fn sample_count(&self) -> usize {
241        self.sample_names.len()
242    }
243
244    /// Get all sample names in header order.
245    pub fn sample_names(&self) -> &[String] {
246        &self.sample_names
247    }
248
249    /// Get the index of a sample by name.
250    ///
251    /// Returns `None` if the sample is not found.
252    pub fn sample_idx(&self, name: &str) -> Option<usize> {
253        self.sample_name_to_idx.get(name).copied()
254    }
255
256    /// Get a reference to the sample name-to-index map.
257    pub fn sample_name_to_idx(&self) -> &HashMap<String, usize> {
258        &self.sample_name_to_idx
259    }
260
261    /// Get the type and length of an INFO field.
262    ///
263    /// Returns `None` if the tag is not defined in the header.
264    pub fn info_type(&self, tag: &[u8]) -> Option<(TagType, TagLength)> {
265        self.view().info_type(tag).ok()
266    }
267
268    /// Get the type and length of a FORMAT field.
269    ///
270    /// Returns `None` if the tag is not defined in the header.
271    pub fn format_type(&self, tag: &[u8]) -> Option<(TagType, TagLength)> {
272        self.view().format_type(tag).ok()
273    }
274
275    /// Synchronize header changes to the underlying htslib structure.
276    ///
277    /// Called automatically after modifications; usually not needed directly.
278    pub fn sync(&self) {
279        if !self.dirty.swap(false, Ordering::AcqRel) {
280            return;
281        }
282        unsafe {
283            rust_htslib::htslib::bcf_hdr_sync(self.inner);
284        }
285    }
286
287    /// Append a raw header line (e.g., `##INFO=<...>`).
288    ///
289    /// Returns `true` on success, `false` on failure.
290    pub fn push_record(&self, record: &[u8]) -> bool {
291        let Ok(c_str) = CString::new(record) else {
292            return false;
293        };
294        let r = unsafe { rust_htslib::htslib::bcf_hdr_append(self.inner, c_str.as_ptr()) };
295        self.dirty.store(true, Ordering::Release);
296        self.sync();
297        r == 0
298    }
299
300    /// Add an INFO field definition to the header.
301    ///
302    /// # Arguments
303    ///
304    /// * `id` - Field ID (e.g., "DP")
305    /// * `number` - Number field ("1", "A", "R", "G", ".")
306    /// * `ty` - Type ("Integer", "Float", "String", "Flag")
307    /// * `description` - Human-readable description
308    ///
309    /// Returns `true` on success.
310    pub fn add_info(&self, id: &str, number: &str, ty: &str, description: &str) -> bool {
311        let record =
312            format!("##INFO=<ID={id},Number={number},Type={ty},Description=\"{description}\">",);
313        self.push_record(record.as_bytes())
314    }
315
316    /// Add a FORMAT field definition to the header.
317    ///
318    /// # Arguments
319    ///
320    /// * `id` - Field ID (e.g., "GQ")
321    /// * `number` - Number field ("1", "A", "R", "G", ".")
322    /// * `ty` - Type ("Integer", "Float", "String")
323    /// * `description` - Human-readable description
324    ///
325    /// Returns `true` on success.
326    pub fn add_format(&self, id: &str, number: &str, ty: &str, description: &str) -> bool {
327        let record =
328            format!("##FORMAT=<ID={id},Number={number},Type={ty},Description=\"{description}\">",);
329        self.push_record(record.as_bytes())
330    }
331
332    /// Format the header as a VCF header string.
333    ///
334    /// Returns `None` if formatting fails.
335    pub fn to_string(&self) -> Option<String> {
336        self.sync();
337
338        let mut s = rust_htslib::htslib::kstring_t {
339            l: 0,
340            m: 0,
341            s: std::ptr::null_mut(),
342        };
343
344        let ret = unsafe { rust_htslib::htslib::bcf_hdr_format(self.inner_ptr(), 0, &mut s) };
345        if ret != 0 {
346            if !s.s.is_null() {
347                unsafe { rust_htslib::htslib::free(s.s as *mut std::os::raw::c_void) };
348            }
349            return None;
350        }
351
352        let bytes = unsafe { std::slice::from_raw_parts(s.s as *const u8, s.l as usize) };
353        let text = String::from_utf8_lossy(bytes).into_owned();
354
355        if !s.s.is_null() {
356            unsafe { rust_htslib::htslib::free(s.s as *mut std::os::raw::c_void) };
357        }
358
359        Some(text)
360    }
361
362    /// Get a specific field definition by section and ID.
363    ///
364    /// # Arguments
365    ///
366    /// * `section` - "INFO" or "FORMAT"
367    /// * `id` - Field ID (e.g., "DP")
368    ///
369    /// Returns `None` if the field is not found.
370    pub fn get_field(&self, section: &str, id: &str) -> Option<HeaderField> {
371        let tag_info = match section {
372            "INFO" => self.info_type(id.as_bytes()),
373            "FORMAT" => self.format_type(id.as_bytes()),
374            _ => return None,
375        };
376
377        let (tag_type, tag_length) = tag_info?;
378
379        let mut description = String::new();
380        for record in self.header_records() {
381            match record {
382                HeaderRecord::Info { values, .. } if section == "INFO" => {
383                    if values.iter().any(|(k, v)| k.as_str() == "ID" && v == id) {
384                        description = values
385                            .iter()
386                            .find(|(k, _)| k.as_str() == "Description")
387                            .map(|(_, v)| unquote(v))
388                            .unwrap_or_default();
389                        break;
390                    }
391                }
392                HeaderRecord::Format { values, .. } if section == "FORMAT" => {
393                    if values.iter().any(|(k, v)| k.as_str() == "ID" && v == id) {
394                        description = values
395                            .iter()
396                            .find(|(k, _)| k.as_str() == "Description")
397                            .map(|(_, v)| unquote(v))
398                            .unwrap_or_default();
399                        break;
400                    }
401                }
402                _ => {}
403            }
404        }
405
406        Some(HeaderField {
407            id: id.to_string(),
408            r#type: tag_type_to_str(tag_type).to_string(),
409            number: tag_length_to_str(tag_length),
410            description,
411        })
412    }
413
414    /// Get all INFO, FORMAT, and FILTER field definitions.
415    ///
416    /// Returns a vector of (section, field) tuples.
417    pub fn all_fields(&self) -> Vec<(String, HeaderField)> {
418        let mut fields = Vec::new();
419        for record in self.header_records() {
420            match record {
421                HeaderRecord::Info { values, .. } => {
422                    if let Some(field) =
423                        self.parse_record_to_field("INFO", values.into_iter().collect())
424                    {
425                        fields.push(("INFO".to_string(), field));
426                    }
427                }
428                HeaderRecord::Format { values, .. } => {
429                    if let Some(field) =
430                        self.parse_record_to_field("FORMAT", values.into_iter().collect())
431                    {
432                        fields.push(("FORMAT".to_string(), field));
433                    }
434                }
435                HeaderRecord::Filter { values, .. } => {
436                    if let Some(field) =
437                        self.parse_record_to_field("FILTER", values.into_iter().collect())
438                    {
439                        fields.push(("FILTER".to_string(), field));
440                    }
441                }
442                _ => {}
443            }
444        }
445        fields
446    }
447
448    /// Parse a header record into a HeaderField (internal use).
449    fn parse_record_to_field(
450        &self,
451        section: &str,
452        values: Vec<(String, String)>,
453    ) -> Option<HeaderField> {
454        let id = values
455            .iter()
456            .find(|(k, _)| k.as_str() == "ID")
457            .map(|(_, v)| v.as_str())?;
458
459        let (tag_type, tag_length) = match section {
460            "INFO" => self.info_type(id.as_bytes())?,
461            "FORMAT" => self.format_type(id.as_bytes())?,
462            "FILTER" => (TagType::Flag, TagLength::Fixed(0)), // FILTER is implicitly a flag-like type
463            _ => return None,
464        };
465
466        let description = values
467            .iter()
468            .find(|(k, _)| k.as_str() == "Description")
469            .map(|(_, v)| unquote(v))
470            .unwrap_or_default();
471
472        Some(HeaderField {
473            id: id.to_string(),
474            r#type: tag_type_to_str(tag_type).to_string(),
475            number: tag_length_to_str(tag_length),
476            description,
477        })
478    }
479}
480
/// Strip one pair of surrounding double quotes from `s`, if present.
///
/// The input is returned unchanged (as an owned `String`) unless it both starts
/// and ends with `"` and those are two distinct characters.
fn unquote(s: &str) -> String {
    s.strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(s)
        .to_string()
}
488
489fn tag_type_to_str(t: TagType) -> &'static str {
490    match t {
491        TagType::Flag => "Flag",
492        TagType::Integer => "Integer",
493        TagType::Float => "Float",
494        TagType::String => "String",
495    }
496}
497
498fn tag_length_to_str(l: TagLength) -> String {
499    match l {
500        TagLength::Fixed(n) => n.to_string(),
501        TagLength::AltAlleles => "A".to_string(),
502        TagLength::Alleles => "R".to_string(),
503        TagLength::Genotypes => "G".to_string(),
504        TagLength::Variable => ".".to_string(),
505    }
506}