# Create sample data
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from datetime import datetime, timedelta
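# Build a local SparkSession: "local[*]" uses all available cores, and
# spark.sql.adaptive.enabled turns on adaptive query execution (AQE),
# which lets Spark re-optimize shuffle partitioning at runtime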
spark = SparkSession.builder \
    .appName("FeaturePipeline") \
    .master("local[*]") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()
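# Keep console output readable: log only warnings and errors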
spark.sparkContext.setLogLevel("WARN")
# Transaction data
transactions_data = [
    ("C001", "TXN001", 150.0, "Electronics", "2024-01-15"),
    ("C001", "TXN002", 25.0, "Food", "2024-01-18"),
    ("C001", "TXN003", 200.0, "Electronics", "2024-01-25"),
    ("C002", "TXN004", 75.0, "Clothing", "2024-01-10"),
    ("C002", "TXN005", 50.0, "Food", "2024-01-20"),
    ("C003", "TXN006", 500.0, "Electronics", "2024-01-05"),
    ("C003", "TXN007", 30.0, "Food", "2024-01-08"),
    ("C003", "TXN008", 120.0, "Clothing", "2024-01-22"),
    ("C001", "TXN009", 80.0, "Food", "2024-02-01"),
    ("C002", "TXN010", 300.0, "Electronics", "2024-02-05"),
]
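# Explicit schema: the third StructField argument (nullable=False) documents
# that none of these columns may contain nulls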
transactions_schema = StructType([
    StructField("customer_id", StringType(), False),
    StructField("transaction_id", StringType(), False),
    StructField("amount", DoubleType(), False),
    StructField("category", StringType(), False),
    StructField("transaction_date", StringType(), False),
])
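# to_date() parses the ISO (yyyy-MM-dd) strings into a DateType column
# without needing an explicit format pattern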
transactions_df = spark.createDataFrame(transactions_data, transactions_schema) \
    .withColumn("transaction_date", F.to_date("transaction_date"))
print("📦 Transactions brutes :")
transactions_df.show()
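# Optional sanity check (illustrative addition, not part of the original
# pipeline): confirm that transaction_date is now a DateType column
transactions_df.printSchema()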