Lean  $LEAN_TAG$
PandasConverter.cs
1 /*
2  * QUANTCONNECT.COM - Democratizing Finance, Empowering Individuals.
3  * Lean Algorithmic Trading Engine v2.0. Copyright 2014 QuantConnect Corporation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14 */
15 
16 using Python.Runtime;
17 using QuantConnect.Data;
20 using QuantConnect.Util;
21 using System;
22 using System.Collections.Generic;
23 using System.Linq;
24 
25 namespace QuantConnect.Python
26 {
27  /// <summary>
/// Collection of methods that convert lists of objects into a pandas.DataFrame
29  /// </summary>
30  public class PandasConverter
31  {
// Handle to the imported Python "pandas" module; assigned once by the static constructor
private static readonly dynamic _pandas;

// Cached "pandas.concat" attribute, so it is not looked up on every concatenation
private static readonly PyObject _concat;

/// <summary>
/// Initializes the <see cref="PandasConverter"/> class by importing the Python pandas module
/// and caching its <c>concat</c> attribute
/// </summary>
static PandasConverter()
{
    // the Python runtime may only be touched while holding the GIL
    using (Py.GIL())
    {
        var pandas = Py.Import("pandas");
        _pandas = pandas;
        // keep it so we don't need to ask for it each time
        _concat = pandas.GetAttr("concat");
    }
}
48 
/// <summary>
/// Builds a pandas.DataFrame out of a sequence of <see cref="Slice"/> objects
/// </summary>
/// <param name="data">The slices whose data will be added to the data frame</param>
/// <param name="dataType">
/// Optional type of data to include. When null, ticks, trade bars and quote bars are all considered
/// </param>
/// <returns><see cref="PyObject"/> containing a pandas.DataFrame</returns>
public PyObject GetDataFrame(IEnumerable<Slice> data, Type dataType = null)
{
    // with no explicit data type, every supported kind of data is collected
    var anyType = dataType == null;
    var wantsTicks = anyType || dataType == typeof(Tick) || dataType == typeof(OpenInterest);
    var wantsTradeBars = anyType || dataType == typeof(TradeBar);
    var wantsQuoteBars = anyType || dataType == typeof(QuoteBar);

    var pandasDataBySymbol = new Dictionary<SecurityIdentifier, PandasData>();
    var indexLevels = 0;

    foreach (var slice in data)
    {
        AddSliceDataTypeDataToDict(slice, wantsTicks, wantsTradeBars, wantsQuoteBars, pandasDataBySymbol, ref indexLevels, dataType);
    }

    return CreateDataFrame(pandasDataBySymbol, indexLevels);
}
72 
/// <summary>
/// Converts an enumerable of symbol-providing data points in a pandas.DataFrame
/// </summary>
/// <typeparam name="T">Type of the data points, which must expose a symbol through <see cref="ISymbolProvider"/></typeparam>
/// <param name="data">Enumerable of data points to add to the data frame</param>
/// <param name="symbolOnlyIndex">Whether to make the index only the symbol, without time or any other index levels</param>
/// <returns><see cref="PyObject"/> containing a pandas.DataFrame</returns>
/// <remarks>Helper method for testing</remarks>
public PyObject GetDataFrame<T>(IEnumerable<T> data, bool symbolOnlyIndex = false)
    where T : ISymbolProvider
{
    var pandasDataBySymbol = new Dictionary<SecurityIdentifier, PandasData>();
    // required by GetPandasDataValue; the resulting value is intentionally not used below
    var maxLevels = 0;
    foreach (var datum in data)
    {
        var pandasData = GetPandasDataValue(pandasDataBySymbol, datum.Symbol, datum, ref maxLevels);
        pandasData.Add(datum);
    }

    if (symbolOnlyIndex)
    {
        return PandasData.ToPandasDataFrame(pandasDataBySymbol.Values);
    }

    // symbolOnlyIndex is always false from here on, so the number of levels is fixed
    return CreateDataFrame(pandasDataBySymbol,
        // Use 2 instead of maxLevels for backwards compatibility
        maxLevels: 2,
        sort: false,
        // Multiple data frames (one for each symbol) will be concatenated,
        // so make sure missing values are not filtered out before concatenation
        filterMissingValueColumns: pandasDataBySymbol.Count <= 1);
}
103 
/// <summary>
/// Converts a dictionary with a list of <see cref="IndicatorDataPoint"/> in a pandas.DataFrame
/// </summary>
/// <param name="data">Key-value pairs of series name to the list of <see cref="IndicatorDataPoint"/> making up that series</param>
/// <returns><see cref="PyObject"/> containing a pandas.DataFrame</returns>
public PyObject GetIndicatorDataFrame(IEnumerable<KeyValuePair<string, List<IndicatorDataPoint>>> data)
{
    using (Py.GIL())
    {
        // intermediate dictionary of series; released once the data frame has been built,
        // matching what the PyObject overload does
        using var pyDict = new PyDict();

        foreach (var kvp in data)
        {
            AddSeriesToPyDict(kvp.Key, kvp.Value, pyDict);
        }

        return MakeIndicatorDataFrame(pyDict);
    }
}
123 
/// <summary>
/// Builds a pandas.DataFrame from a Python dictionary of indicator data points
/// </summary>
/// <param name="data"><see cref="PyObject"/> that should be a dictionary (convertible to PyDict) of string to list of <see cref="IndicatorDataPoint"/></param>
/// <returns><see cref="PyObject"/> containing a pandas.DataFrame</returns>
/// <exception cref="ArgumentException">Thrown, wrapping the original exception, when the conversion fails</exception>
public PyObject GetIndicatorDataFrame(PyObject data)
{
    using (Py.GIL())
    {
        using var inputPythonType = data.GetPythonType();

        // descriptions used to build the error message; refined as the conversion progresses
        var sourceDescription = inputPythonType.ToString();
        var targetDescription = nameof(PyDict);

        // last dictionary item that was visited, so a failure can report its types
        PyObject lastItem = null;

        try
        {
            using var inputDict = new PyDict(data);
            using var seriesByName = new PyDict();

            // the input is a dictionary; from now on failures relate to its items
            targetDescription = $"{nameof(String)}: {nameof(List<IndicatorDataPoint>)}";

            foreach (var item in inputDict.Items())
            {
                lastItem = item;

                var seriesName = item[0].As<string>();
                var seriesPoints = item[1].As<List<IndicatorDataPoint>>();
                AddSeriesToPyDict(seriesName, seriesPoints, seriesByName);
            }

            return MakeIndicatorDataFrame(seriesByName);
        }
        catch (Exception e)
        {
            if (lastItem != null)
            {
                sourceDescription = $"{lastItem[0].GetPythonType()}: {lastItem[1].GetPythonType()}";
            }

            var message = Messages.PandasConverter.ConvertToDictionaryFailed(sourceDescription, targetDescription, e.Message);
            throw new ArgumentException(message, e);
        }
    }
}
164 
/// <summary>
/// Returns a string that represents the current object
/// </summary>
/// <returns>
/// The representation of the imported pandas module, or a fixed message
/// when the module reference is not available
/// </returns>
public override string ToString()
{
    if (_pandas == null)
    {
        // without the module there is nothing to call Repr() on, so bail out
        // instead of falling through to a null dereference
        return "pandas module was not imported.";
    }

    using (Py.GIL())
    {
        return _pandas.Repr();
    }
}
181 
/// <summary>
/// Creates a data frame by concatenating the data frames produced from the given data
/// </summary>
/// <param name="dataBySymbol">The pandas data to convert, keyed by security identifier</param>
/// <param name="maxLevels">Number of levels forwarded to each <see cref="PandasData"/> conversion</param>
/// <param name="sort">Whether to sort the resulting data frame</param>
/// <param name="filterMissingValueColumns">Forwarded to each <see cref="PandasData"/> conversion</param>
/// <returns><see cref="PyObject"/> containing a pandas.DataFrame, empty when there is no data</returns>
private static PyObject CreateDataFrame(Dictionary<SecurityIdentifier, PandasData> dataBySymbol, int maxLevels = 2, bool sort = true,
    bool filterMissingValueColumns = true)
{
    using (Py.GIL())
    {
        if (dataBySymbol.Count == 0)
        {
            return _pandas.DataFrame();
        }

        // Materialize the per-symbol data frames exactly once. A deferred query would be
        // re-evaluated by the clean-up loop, creating (and disposing) brand new frames
        // while the ones that were actually concatenated are never released.
        var dataFrames = new List<PyObject>(dataBySymbol.Count);
        try
        {
            foreach (var pandasData in dataBySymbol.Values)
            {
                dataFrames.Add(pandasData.ToPandasDataFrame(maxLevels, filterMissingValueColumns));
            }

            return ConcatDataFrames(dataFrames, sort: sort, dropna: true);
        }
        finally
        {
            // the intermediate frames are no longer needed, whether or not concatenation succeeded
            foreach (var dataFrame in dataFrames)
            {
                dataFrame.Dispose();
            }
        }
    }
}
206 
/// <summary>
/// Concatenates multiple data frames
/// </summary>
/// <param name="dataFrames">The data frames to concatenate</param>
/// <param name="keys">
/// Optional new keys for a new multi-index level that would be added
/// to index each individual data frame in the resulting one.
/// Only applied when <paramref name="names"/> is provided as well
/// </param>
/// <param name="names">
/// The optional names of the new index level (and the existing ones if they need to be changed).
/// Only applied when <paramref name="keys"/> is provided as well
/// </param>
/// <param name="sort">Whether to sort the resulting data frame</param>
/// <param name="dropna">Whether to drop columns containing NA values only (Nan, None, etc)</param>
/// <returns>A new data frame result from concatenating the input, empty when there are no input data frames</returns>
public static PyObject ConcatDataFrames(IEnumerable<PyObject> dataFrames, IEnumerable<object> keys = null, IEnumerable<string> names = null,
    bool sort = true, bool dropna = true)
{
    using (Py.GIL())
    {
        var dataFramesList = dataFrames.ToList();
        if (dataFramesList.Count == 0)
        {
            return _pandas.DataFrame();
        }

        using var pyDataFrames = dataFramesList.ToPyListUnSafe();
        using var kwargs = Py.kw("sort", sort);
        PyList pyKeys = null;
        PyList pyNames = null;
        PyObject result = null;

        try
        {
            if (keys != null && names != null)
            {
                pyKeys = keys.ToPyListUnSafe();
                pyNames = names.ToPyListUnSafe();
                kwargs.SetItem("keys", pyKeys);
                kwargs.SetItem("names", pyNames);
            }

            result = _concat.Invoke(new[] { pyDataFrames }, kwargs);

            // Drop columns with only NaN or None values
            if (dropna)
            {
                using var dropnaKwargs = Py.kw("axis", 1, "inplace", true, "how", "all");
                using var dropnaMethod = result.GetAttr("dropna");
                // the call works in place, its return value is of no use
                dropnaMethod.Invoke(Array.Empty<PyObject>(), dropnaKwargs).Dispose();
            }

            return result;
        }
        catch
        {
            // do not leak a partially processed result when a later step fails
            result?.Dispose();
            throw;
        }
        finally
        {
            // released on both the success and the failure path
            pyKeys?.Dispose();
            pyNames?.Dispose();
        }
    }
}
258 
/// <summary>
/// Builds a pandas series out of the given <see cref="IndicatorDataPoint"/> list, indexed by the end time
/// of each point, and stores it in the <see cref="PyDict"/> under the lower-cased <paramref name="key"/>
/// </summary>
/// <param name="key">Key to insert in the <see cref="PyDict"/></param>
/// <param name="points">List of <see cref="IndicatorDataPoint"/> that will make up the resulting series</param>
/// <param name="pyDict"><see cref="PyDict"/> where the resulting key-value pair will be inserted into</param>
private void AddSeriesToPyDict(string key, List<IndicatorDataPoint> points, PyDict pyDict)
{
    var timestamps = new List<DateTime>();
    var seriesValues = new List<double>();

    for (var i = 0; i < points.Count; i++)
    {
        var point = points[i];
        timestamps.Add(point.EndTime);
        seriesValues.Add((double) point.Value);
    }

    var seriesName = key.ToLowerInvariant();
    var series = _pandas.Series(seriesValues, timestamps);
    pyDict.SetItem(seriesName, series);
}
278 
/// <summary>
/// Creates a pandas.DataFrame from a <see cref="PyDict"/> of string to pandas.Series,
/// with its columns being the lower-cased dictionary keys in ascending order
/// </summary>
/// <param name="pyDict"><see cref="PyDict"/> of string to pandas.Series</param>
/// <returns><see cref="PyObject"/> containing a pandas.DataFrame</returns>
private PyObject MakeIndicatorDataFrame(PyDict pyDict)
{
    var orderedColumns = pyDict.Keys()
        .Select(name => name.As<string>().ToLowerInvariant())
        .OrderBy(name => name);

    return _pandas.DataFrame(pyDict, columns: orderedColumns);
}
288 
/// <summary>
/// Looks up the <see cref="PandasData"/> stored for the given symbol. When there is none, a new instance is
/// created from the given data, stored in the dictionary, and <paramref name="maxLevels"/> is raised to the
/// new instance's number of levels if that is greater
/// </summary>
/// <param name="sliceDataDict">Dictionary of pandas data keyed by security identifier</param>
/// <param name="symbol">Symbol whose identifier is used as the dictionary key</param>
/// <param name="data">Data used to create the <see cref="PandasData"/> when it does not exist yet</param>
/// <param name="maxLevels">Running maximum of the levels of the created instances</param>
/// <returns>The existing or newly created <see cref="PandasData"/></returns>
private PandasData GetPandasDataValue(IDictionary<SecurityIdentifier, PandasData> sliceDataDict, Symbol symbol, object data, ref int maxLevels)
{
    var identifier = symbol.ID;
    if (sliceDataDict.TryGetValue(identifier, out var existing))
    {
        return existing;
    }

    var created = new PandasData(data);
    sliceDataDict[identifier] = created;
    maxLevels = Math.Max(maxLevels, created.Levels);

    return created;
}
304 
/// <summary>
/// Adds the data of the slice that matches the requested data type to the pandas data dictionary
/// </summary>
/// <param name="slice">The slice whose data points will be visited</param>
/// <param name="requestedTick">Whether tick data should be added</param>
/// <param name="requestedTradeBar">Whether trade bars should be added</param>
/// <param name="requestedQuoteBar">Whether quote bars should be added</param>
/// <param name="sliceDataDict">Dictionary of pandas data, keyed by security identifier, the data is added to</param>
/// <param name="maxLevels">Running maximum of the levels of the pandas data instances created along the way</param>
/// <param name="dataType">Optional requested data type, used to match data when no bar type was requested</param>
private void AddSliceDataTypeDataToDict(Slice slice, bool requestedTick, bool requestedTradeBar, bool requestedQuoteBar, IDictionary<SecurityIdentifier, PandasData> sliceDataDict, ref int maxLevels, Type dataType = null)
{
    // created lazily, only needed when trade and quote bars are both requested
    HashSet<SecurityIdentifier> processedSymbols = null;

    for (var index = 0; index < slice.AllData.Count; index++)
    {
        var baseData = slice.AllData[index];
        var pandasData = GetPandasDataValue(sliceDataDict, baseData.Symbol, baseData, ref maxLevels);

        if (pandasData.IsCustomData)
        {
            pandasData.Add(baseData);
            continue;
        }

        if (requestedTick && baseData is Tick tick)
        {
            pandasData.AddTick(tick);
            continue;
        }

        // support for auxiliary data history requests
        var noBarsRequested = !(requestedTradeBar || requestedQuoteBar);
        if (noBarsRequested && dataType != null && baseData.GetType().IsAssignableTo(dataType))
        {
            pandasData.Add(baseData);
            continue;
        }

        // quote and trade bars of a symbol are added at the same time because they share the row
        // in the data frame, so each symbol is handled once per slice or it would generate 2 rows per series
        if (requestedTradeBar && requestedQuoteBar)
        {
            processedSymbols ??= new();
            if (!processedSymbols.Add(baseData.Symbol.ID))
            {
                continue;
            }
        }

        // the slice already has the data organized by symbol, so the counterpart bar
        // is fetched from its Bars/QuoteBars collections
        TradeBar tradeBar = null;
        QuoteBar quoteBar = null;
        if (requestedTradeBar && baseData is TradeBar currentTradeBar)
        {
            tradeBar = currentTradeBar;
            slice.QuoteBars.TryGetValue(tradeBar.Symbol, out quoteBar);
        }
        else if (requestedQuoteBar && baseData is QuoteBar currentQuoteBar)
        {
            quoteBar = currentQuoteBar;
            slice.Bars.TryGetValue(quoteBar.Symbol, out tradeBar);
        }

        pandasData.Add(tradeBar, quoteBar);
    }
}
367  }
368 }