Hi all,
I am not sure whether this is considered a bug or not…
While testing my Discord Bot with OpenAI API integration I encountered a curious behaviour:
When issuing the simple query “test”, the API returns the following piece of code:
'_data.csv\')\n\n# Split the data into features and target label\nincome_raw = data[\'income\']\nfeatures_raw = data.drop(\'income\', axis = 1)\n\n# Visualize skewed continuous features of original data\nvs.distribution(data)\n\n# Log-transform the skewed features\nskewed = [\'capital-gain\', \'capital-loss\']\nfeatures_log_transformed = pd.DataFrame(data = features_raw)\nfeatures_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))\n\n# Visualize the new log distributions\nvs.distribution(features_log_transformed, transformed = True)\n\n# Normalize numerical features\nscaler = MinMaxScaler() # default=(0, 1)\nnumerical = [\'age\', \'education-num\', \'capital-gain\', \'capital-loss\', \'hours-per-week\']\n\nfeatures_log_minmax_transform = pd.DataFrame(data = features_log_transformed)\nfeatures_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])\n\n# Show an example of a record with scaling applied\ndisplay(features_log_minmax_transform.head(n = 5))\n\n# One-hot encode the \'features_log_minmax_transform\' data using pandas.get_dummies()\nfeatures_final = pd.get_dummies(features_log_minmax_transform)\n\n# Encode the \'income_raw\' data to numerical values\nincome = income_raw.apply(lambda x: 1 if x == \'>50K\' else 0)\n\n# Print the number of features after one-hot encoding\nencoded = list(features_final.columns)\nprint("{} total features after one-hot encoding.".format(len(encoded)))\n\n# Uncomment the following line to see the encoded feature names\n# print encoded\n\n# Import train_test_split\nfrom sklearn.model_selection import train_test_split\n\n# Split the \'features\' and \'income\' data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(features_final, \n income, \n test_size = 0.2, \n random_state = 0)\n\n# Show the results of the split\nprint("Training set has {} samples.".format(X_train.shape[0]))\nprint("Testing set has {} 
samples.".format(X_test.shape[0]))\n\n# Import two metrics from sklearn - fbeta_score and accuracy_score\nfrom sklearn.metrics import fbeta_score, accuracy_score\n\ndef train_predict(learner, sample_size, X_train, y_train, X_test, y_test): \n \'\'\'\n inputs:\n - learner: the learning algorithm to be trained and predicted on\n - sample_size: the size of samples (number) to be drawn from training set\n - X_train: features training set\n - y_train: income training set\n - X_test: features testing set\n - y_test: income testing set\n \'\'\'\n \n results = {}\n \n # Fit the learner to the training data using slicing with \'sample_size\' using .fit(training_features[:], training_labels[:])\n start = time() # Get start time\n learner = learner.fit(X_train[:sample_size], y_train[:sample_size])\n end = time() # Get end time\n \n # Calculate the training time\n results[\'train_time\'] = end - start\n \n # Get the predictions on the test set(X_test),\n # then get predictions on the first 300 training samples(X_train) using .predict()\n start = time() # Get start time\n predictions_test = learner.predict(X_test)\n predictions_train = learner.predict(X_train[:300])\n end = time() # Get end time\n \n # Calculate the total prediction time\n results[\'pred_time\'] = end - start\n \n # Comp'
The following parameters are being used:
model='text-davinci-003', prompt=prompt, temperature=0.3,