import os
import shutil
# Build a richer sample orders dataset: 13 orders spread over 5 customers
# in two segments. Dates are supplied as ISO strings and converted to a
# date column as part of the same expression.
orders_extended = (
    spark.createDataFrame(
        [
            # Customer 101 - Premium, 3 orders
            (1, 101, "2024-01-01", 150.0, "Premium"),
            (2, 101, "2024-01-15", 200.0, "Premium"),
            (3, 101, "2024-02-01", 180.0, "Premium"),
            # Customer 102 - Standard, 4 orders
            (4, 102, "2024-01-05", 300.0, "Standard"),
            (5, 102, "2024-01-20", 250.0, "Standard"),
            (6, 102, "2024-02-10", 275.0, "Standard"),
            (7, 102, "2024-03-01", 320.0, "Standard"),
            # Customer 103 - Premium, 2 orders
            (8, 103, "2024-01-10", 400.0, "Premium"),
            (9, 103, "2024-02-15", 450.0, "Premium"),
            # Customer 104 - Standard, 3 orders
            (10, 104, "2024-01-03", 100.0, "Standard"),
            (11, 104, "2024-01-25", 120.0, "Standard"),
            (12, 104, "2024-02-20", 90.0, "Standard"),
            # Customer 105 - Premium, 1 order
            (13, 105, "2024-02-01", 500.0, "Premium"),
        ],
        ["order_id", "customer_id", "order_date", "amount", "segment"],
    )
    # Replace the string order_date column with its to_date() conversion.
    .withColumn("order_date", to_date(col("order_date")))
)

# Register the DataFrame under the name "orders_ext" so it can be queried
# through SQL.
orders_extended.createOrReplaceTempView("orders_ext")

# Display the source rows (show() prints the first 20 rows by default,
# which covers all 13 here).
print("📦 Données source:")
orders_extended.show()