1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
use polars_core::frame::groupby::IntoGroupsProxy;
use polars_core::utils::Wrap;
use super::*;
/// Conversion of a column into a `DataFrame` of dummy (indicator) columns.
///
/// The default method is a fallback for types without a dedicated
/// implementation: it always fails with `PolarsError::InvalidOperation`.
pub trait ToDummies<T> {
    /// Returns the dummy-encoded frame, or an `InvalidOperation` error when the
    /// implementing type does not override this method.
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        let msg = "to_dummies is not implemented for this dtype";
        Err(PolarsError::InvalidOperation(msg.into()))
    }
}
// Element type and matching chunked-array type used for the generated dummy
// columns: `u8` / `UInt8Chunked` when the `dtype-u8` feature is enabled,
// otherwise `i32` / `Int32Chunked`.
#[cfg(feature = "dtype-u8")]
type DummyType = u8;
#[cfg(feature = "dtype-u8")]
type DummyCa = UInt8Chunked;
// Fallback pair used when the `dtype-u8` feature is not enabled.
#[cfg(not(feature = "dtype-u8"))]
type DummyType = i32;
#[cfg(not(feature = "dtype-u8"))]
type DummyCa = Int32Chunked;
/// Builds a single indicator column called `name` holding `len` values.
///
/// Every position listed in `groups` is set to 1; all other positions stay 0.
/// The indices are not bounds-checked, so each one must be `< len`.
fn dummies_helper(mut groups: Vec<IdxSize>, len: usize, name: &str) -> DummyCa {
    // Write the flags in ascending index order.
    groups.sort_unstable();

    let mut values: Vec<DummyType> = vec![0; len];
    for &row in groups.iter() {
        // SAFETY: relies on the caller passing only indices below `len`;
        // nothing in this function verifies that.
        let slot = unsafe { values.get_unchecked_mut(row as usize) };
        *slot = 1;
    }

    ChunkedArray::from_vec(name, values)
}
/// Returns the given columns ordered by their names (stable sort).
fn sort_columns(mut columns: Vec<Series>) -> Vec<Series> {
    // Column names are strings, which are totally ordered.
    columns.sort_by(|lhs, rhs| lhs.name().cmp(rhs.name()));
    columns
}
impl ToDummies<Utf8Type> for Wrap<Utf8Chunked> {
fn to_dummies(&self) -> PolarsResult<DataFrame> {
let ca = &self.0;
let groups = ca.group_tuples(true, false)?.into_idx();
let col_name = ca.name();
let taker = ca.take_rand();
let columns = groups
.into_par_iter()
.map(|(first, groups)| {
let name = match unsafe { taker.get_unchecked(first as usize) } {
Some(val) => format!("{col_name}_{val}"),
None => format!("{col_name}_null"),
};
let ca = dummies_helper(groups, self.len(), &name);
ca.into_series()
})
.collect();
Ok(DataFrame::new_no_checks(sort_columns(columns)))
}
}
#[cfg(feature = "dtype-categorical")]
impl ToDummies<Utf8Type> for Wrap<CategoricalChunked> {
    /// Dummy-encodes the wrapped categorical column.
    ///
    /// Grouping runs on the logical (physical-index) array. Each group becomes
    /// one indicator column named `<column>_<category>`, where the category
    /// string is looked up in the reverse mapping, or `<column>_null` when the
    /// group's first row holds no value. The resulting columns are ordered by
    /// name.
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        let rev_map = self.get_rev_map();
        let physical = self.logical();
        let groups = physical.group_tuples(true, false)?.into_idx();
        let n_rows = self.len();
        let col_name = self.name();
        let taker = self.logical().take_rand();

        let columns: Vec<Series> = groups
            .into_par_iter()
            .map(|(first, members)| {
                // SAFETY: `first` is a row index produced by grouping this same
                // array, so it is presumably in bounds; it is not re-checked here.
                let first_val = unsafe { taker.get_unchecked(first as usize) };
                let name = first_val.map_or_else(
                    || format!("{col_name}_null"),
                    |val| {
                        let name = rev_map.get(val);
                        format!("{col_name}_{name}")
                    },
                );
                dummies_helper(members, n_rows, &name).into_series()
            })
            .collect();

        Ok(DataFrame::new_no_checks(sort_columns(columns)))
    }
}
impl<T> ToDummies<T> for ChunkedArray<T>
where
T: PolarsIntegerType + Sync,
T::Native: NumericNative,
{
fn to_dummies(&self) -> PolarsResult<DataFrame> {
let groups = self.group_tuples(true, false)?.into_idx();
let col_name = self.name();
let taker = self.take_rand();
let columns = groups
.into_par_iter()
.map(|(first, groups)| {
let name = match unsafe { taker.get_unchecked(first as usize) } {
Some(val) => format!("{col_name}_{val}"),
None => format!("{col_name}_null"),
};
let ca = dummies_helper(groups, self.len(), &name);
ca.into_series()
})
.collect();
Ok(DataFrame::new_no_checks(sort_columns(columns)))
}
}
// Float columns get no dedicated implementation: this empty impl inherits the
// trait's default `to_dummies`, which returns an `InvalidOperation` error.
impl<T: PolarsFloatType> ToDummies<Float32Type> for WrapFloat<ChunkedArray<T>> {}
impl ToDummies<Wrap<BooleanType>> for Wrap<BooleanChunked> {
    /// Dummy-encodes a boolean column by casting it to `Int8` and delegating to
    /// the `to_dummies` of the resulting series.
    fn to_dummies(&self) -> PolarsResult<DataFrame> {
        self.cast(&DataType::Int8)?.to_ops().to_dummies()
    }
}