1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
use polars_core::frame::groupby::IntoGroupsProxy;
use polars_core::utils::Wrap;

use super::*;

/// Conversion of a column into a `DataFrame` of indicator ("dummy") columns.
///
/// The provided method is a rejecting fallback: an implementor that does not
/// override `to_dummies` reports the operation as unsupported.
pub trait ToDummies<T> {
    /// Builds the indicator columns for `self`.
    ///
    /// # Errors
    ///
    /// The default implementation always returns
    /// `PolarsError::InvalidOperation`.
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        let reason = "to_dummies is not implemented for this dtype";
        Err(PolarsError::InvalidOperation(reason.into()))
    }
}

// Element type (`DummyType`) and chunked-array type (`DummyCa`) of the
// indicator columns built by `dummies_helper`.
//
// With the `dtype-u8` feature enabled the indicators are stored as `u8`.
#[cfg(feature = "dtype-u8")]
type DummyType = u8;
#[cfg(feature = "dtype-u8")]
type DummyCa = UInt8Chunked;

// Without the `dtype-u8` feature the indicators are stored as `i32` instead.
#[cfg(not(feature = "dtype-u8"))]
type DummyType = i32;
#[cfg(not(feature = "dtype-u8"))]
type DummyCa = Int32Chunked;

/// Builds a single indicator column named `name` with `len` rows.
///
/// Every row whose index appears in `groups` is set to `1`; all other rows
/// stay `0`.
///
/// The indices are not bounds-checked: the caller must only pass indices
/// strictly below `len`.
fn dummies_helper(mut groups: Vec<IdxSize>, len: usize, name: &str) -> DummyCa {
    // Visit the member rows in ascending index order.
    groups.sort_unstable();

    let mut indicator: Vec<DummyType> = vec![0; len];
    groups.into_iter().for_each(|row| {
        // SAFETY: relies on the caller's guarantee that `row < len`; the
        // callers in this file pass row indices obtained from `group_tuples`
        // together with the length of the same array.
        unsafe { *indicator.get_unchecked_mut(row as usize) = 1 };
    });

    ChunkedArray::from_vec(name, indicator)
}

/// Returns `columns` sorted ascending by column name.
///
/// The sort is stable and compares the names as strings (byte-wise), so a
/// name such as `a_10` orders before `a_2`.
fn sort_columns(mut columns: Vec<Series>) -> Vec<Series> {
    // String names are totally ordered, so `Ord::cmp` is used directly rather
    // than `partial_cmp(..).unwrap()`; the ordering is the same and there is
    // no panic path.
    columns.sort_by(|a, b| a.name().cmp(b.name()));
    columns
}

impl ToDummies<Utf8Type> for Wrap<Utf8Chunked> {
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        let ca = &self.0;
        let groups = ca.group_tuples(true, false)?.into_idx();
        let col_name = ca.name();
        let taker = ca.take_rand();

        let columns = groups
            .into_par_iter()
            .map(|(first, groups)| {
                let name = match unsafe { taker.get_unchecked(first as usize) } {
                    Some(val) => format!("{col_name}_{val}"),
                    None => format!("{col_name}_null"),
                };
                let ca = dummies_helper(groups, self.len(), &name);
                ca.into_series()
            })
            .collect();

        Ok(DataFrame::new_no_checks(sort_columns(columns)))
    }
}

/// Dummy encoding for categorical columns.
///
/// Groups are computed on the array returned by `logical()`. The value found
/// at each group's first row is looked up in the reverse mapping to obtain the
/// text used in the column name `{column}_{category}`; a missing value yields
/// `{column}_null`. The resulting columns are ordered by name.
#[cfg(feature = "dtype-categorical")]
impl ToDummies<Utf8Type> for Wrap<CategoricalChunked> {
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        let rev_map = self.get_rev_map();
        let physical = self.logical();
        let len = self.len();

        let groups = physical.group_tuples(true, false)?.into_idx();
        let col_name = self.name();
        let taker = physical.take_rand();

        let columns: Vec<Series> = groups
            .into_par_iter()
            .map(|(first, members)| {
                // SAFETY: `first` is a row index handed out by `group_tuples`
                // for this same array; the unchecked read relies on it being
                // in bounds.
                let first_value = unsafe { taker.get_unchecked(first as usize) };
                let name = first_value.map_or_else(
                    || format!("{col_name}_null"),
                    |val| {
                        let name = rev_map.get(val);
                        format!("{col_name}_{name}")
                    },
                );
                dummies_helper(members, len, &name).into_series()
            })
            .collect();

        let sorted = sort_columns(columns);
        Ok(DataFrame::new_no_checks(sorted))
    }
}

impl<T> ToDummies<T> for ChunkedArray<T>
where
    T: PolarsIntegerType + Sync,
    T::Native: NumericNative,
{
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        let groups = self.group_tuples(true, false)?.into_idx();
        let col_name = self.name();
        let taker = self.take_rand();

        let columns = groups
            .into_par_iter()
            .map(|(first, groups)| {
                let name = match unsafe { taker.get_unchecked(first as usize) } {
                    Some(val) => format!("{col_name}_{val}"),
                    None => format!("{col_name}_null"),
                };

                let ca = dummies_helper(groups, self.len(), &name);
                ca.into_series()
            })
            .collect();

        Ok(DataFrame::new_no_checks(sort_columns(columns)))
    }
}

/// Float columns do not override `to_dummies`, so they use the trait's
/// default implementation, which returns `PolarsError::InvalidOperation`.
impl<T> ToDummies<Float32Type> for WrapFloat<ChunkedArray<T>> where T: PolarsFloatType {}
/// Dummy encoding for boolean columns.
///
/// The column is cast to `Int8` and the encoding is delegated to the
/// `to_dummies` of the resulting series.
///
/// NOTE(review): the indicator columns are therefore presumably named after
/// the cast integer values rather than `true`/`false` — confirm.
impl ToDummies<Wrap<BooleanType>> for Wrap<BooleanChunked> {
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        // A failed cast is propagated to the caller unchanged.
        self.cast(&DataType::Int8)?.to_ops().to_dummies()
    }
}