Découverte du SQL

Explorer des données avec SQL

Author

Ludovic Deneuville

🚧

Idées :

évolution du classement
nombre de fois donné par an pour 10000 naissances

1 Définitions

Base de données : Ensemble de tableaux
Table :
- Lignes (individus)
- Colonnes (attributs)
Types de Données :
- Texte (comme un nom, une adresse)
- Nombre (comme l’âge, le prix)
- Date
- Booléen

Pourquoi Utiliser une Base de Données ?

Parce que c’est mieux que d’utiliser du papier
Permet de gérer de grandes quantités d’informations

2 Fichier des prénoms

https://www.insee.fr/fr/statistiques/8205621?sommaire=8205628

Fichier France
Fichier par département de naissance

2.1 Aperçu du fichier national

sexe	preusuel	annais	nombre
1	ANDRÉ	1960	4006
1	DENIS	1960	4658
1	FRÉDÉRIC	1960	4314
1	HERVÉ	1960	4326
1	JEAN-CLAUDE	1960	4030
1	JOËL	1960	4888
1	LAURENT	1960	4275
1	YVES	1960	4485
2	ANNE	1960	4813
2	CLAUDINE	1960	4194
2	ELISABETH	1960	4106
2	EVELYNE	1960	4888
2	LAURENCE	1960	4375
2	MARIE-CHRISTINE	1960	4096
2	MONIQUE	1960	4590
2	NICOLE	1960	4602

3 Lancement

Se connecter au SSPCloud
Ouvrir un service CloudBeaver

4 Exercice

4.1 Initialisation

Créons une vue pointant vers le fichier

CREATE OR REPLACE VIEW prenom_raw AS FROM 'https://www.insee.fr/fr/statistiques/fichier/8205621/prenoms-2023-nat.parquet';

SELECT *
  FROM prenom_raw;

Suppression des données manquantes

CREATE OR REPLACE VIEW prenom
AS
SELECT *
  FROM prenom_raw
 WHERE annais <> 'XXXX'
   AND preusuel <> '_PRENOMS_RARES';

4.2 Top 10 féminin 2023

Le top 10 féminin annoncé pour l’année 2023 est le suivant :

Rang	Prénom	Nombre
1	LOUISE	3177
2	AMBRE	3168
3	ALBA	3088
4	JADE	2891
5	EMMA	2663
6	ROSE	2375
7	ALMA	2342
8	ALICE	2256
9	ROMY	2198
10	ANNA	2129

Vérifions en écrivant une requête

SELECT *
  FROM prenom
 WHERE sexe = 2
   AND annais = '2023'
 ORDER BY nombre DESC
 LIMIT 10;

Quel prénom est classé 11e dans cette liste ?
Est-ce que l’on peut écrire ce prénom autrement ?

SELECT *
  FROM prenom
 WHERE annais = '2023'
   AND STRIP_ACCENTS(preusuel) = 'EVA';

Refaire le vrai classement

SELECT STRIP_ACCENTS(preusuel),
       SUM(nombre) AS nb
  FROM prenom
 WHERE sexe = 2
   AND annais = '2023'
 GROUP BY STRIP_ACCENTS(preusuel)
 ORDER BY nb DESC
 LIMIT 10;

Corriger les données

CREATE OR REPLACE VIEW prenom
AS
SELECT sexe, 
      STRIP_ACCENTS(preusuel) AS preusuel, 
      annais, 
      nombre 
 FROM prenom_raw
WHERE annais <> 'XXXX'
  AND preusuel <> '_PRENOMS_RARES';

### Evolution au cours du temps

 SELECT *
  FROM prenom
 WHERE preusuel = 'LUDOVIC'
 ORDER BY annais;

Faire un diagramme en barre avec matplotlib
Demander à un LLM

Code Python

import duckdb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

con = duckdb.connect(database=':memory:', read_only=False)

parquet_url = 'https://www.insee.fr/fr/statistiques/fichier/8205621/prenoms-2023-nat.parquet'

try:
    con.execute("SET force_download=true")
    
    con.execute(f"""
        CREATE OR REPLACE VIEW prenom_raw AS FROM '{parquet_url}';
    """)

    con.execute(f"""
        CREATE OR REPLACE VIEW prenom
        AS
        SELECT sexe, 
               STRIP_ACCENTS(preusuel) AS preusuel, 
               annais, 
               nombre 
          FROM prenom_raw
         WHERE annais <> 'XXXX'
           AND preusuel <> '_PRENOMS_RARES';
    """)

except duckdb.Error as e:
    print(f"An error occurred with DuckDB: {e}")
    con.close()
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    con.close()

prenom = "LUDOVIC"

try:
    query = f"""
        SELECT annais, nombre
        FROM prenom
        WHERE preusuel = '{prenom}'
          AND annais <> 'XXXX'
          AND annais >= '1950'
        ORDER BY annais;
    """
    result = con.execute(query).fetchdf()

    if not result.empty:
        years = [int(y) for y in result['annais']]
        counts = result['nombre']

        # Create the bar plot
        plt.figure(figsize=(10, 6))
        plt.bar(years, counts, color='skyblue')

        #cmap = LinearSegmentedColormap.from_list("gradient", ["darkgreen", "orange"])
        #norm = plt.Normalize(vmin=min(counts), vmax=max(counts))
        #plt.bar(years, counts, color=cmap(norm(counts)))

        # Add labels and title
        plt.xlabel("Année")
        plt.ylabel("Nombre de fois")
        plt.title(f"Évolution du Prénom {prenom} par Année")

        years_to_show = np.arange(min(years), max(years) + 1, 10)
        plt.xticks(years_to_show, rotation=45, ha='right')
        plt.tight_layout()

        # Show the plot
        plt.grid(axis='y', linestyle='--')
        plt.show()

    else:
        print("No data found for the first name 'LUDOVIC'.")

except duckdb.Error as e:
    print(f"An error occurred with DuckDB: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

4.3 Prénoms les plus donnés chaque année

WITH ranked_prenoms AS (
    SELECT
        sexe,
        preusuel,
        annais,
        nombre,
        RANK() OVER (PARTITION BY annais ORDER BY nombre DESC) as rnk
    FROM prenom
   WHERE sexe = 2 AND annais >= '1950'
)
SELECT annais,
       preusuel
  FROM ranked_prenoms
 WHERE rnk = 1
 ORDER BY annais;

4.4 Différents prénoms

CREATE OR REPLACE VIEW differents
AS
SELECT DISTINCT(preusuel)
  FROM prenom;

SELECT COUNT(1) 
 FROM differents;

SELECT *
 FROM differents
WHERE preusuel LIKE '%A%B%C%';


SELECT *
 FROM differents
WHERE preusuel LIKE '%W%W%';

4.5 Prénoms mixtes

SELECT preusuel,
       SUM(nombre) FILTER (WHERE sexe = 1) AS masculin,
       SUM(nombre) FILTER (WHERE sexe = 2) AS feminin
  FROM prenom
 GROUP BY preusuel
HAVING masculin > 1000 AND feminin > 1000
 ORDER BY masculin + feminin DESC

Code Python

prenom = "ALIX"

try:
    query = f"""
        SELECT preusuel,
               annais,
               SUM(nombre) FILTER (WHERE sexe = 1) AS masculin,
               SUM(nombre) FILTER (WHERE sexe = 2) AS feminin
          FROM prenom
         WHERE preusuel = '{prenom}'
           AND annais <> 'XXXX'
           AND annais >= '1950'
         GROUP BY preusuel,
                  annais
         ORDER BY annais
    """
    result = con.execute(query).fetchdf()

    if not result.empty:

        years = [int(y) for y in result['annais']]
        countf = result['feminin']
        countm = result['masculin']
        
        # Create the bar plot
        plt.figure(figsize=(10, 6))
        
        plt.bar([y+0.2 for y in years], countf, color='lightcoral', label='Femmes')
        plt.bar(years, countm, color='lightskyblue', label='Hommes')
        
        # Add labels and title
        plt.xlabel("Année")
        plt.ylabel("Nombre de fois")
        plt.title(f"Évolution du Prénom {prenom} par Année")
        
        years_to_show = np.arange(min(years), max(years) + 1, 10)
        plt.xticks(years_to_show, rotation=45, ha='right')
        plt.tight_layout()
        
        # Add a legend
        plt.legend()
        
        # Show the plot
        plt.grid(axis='y', linestyle='--')
        plt.show()


except duckdb.Error as e:
    print(f"An error occurred with DuckDB: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

4.6 Première lettre

SELECT annais,
       preusuel[1],
       SUM(nombre) AS nb
  FROM prenom
 WHERE annais <> 'XXXX'
   AND annais = '2023'
   AND sexe = 2
 GROUP BY annais,
          preusuel[1]
 ORDER BY annais,
          nb DESC

WITH ranked_prenoms AS (
    SELECT
        annais,
        preusuel[1] AS preusuel,
        SUM(nombre) AS nb,
        sexe,
        ROW_NUMBER() OVER (PARTITION BY annais, sexe ORDER BY SUM(nombre) DESC) AS rn
    FROM
        prenom
    WHERE
        annais <> 'XXXX'
    GROUP BY
        annais,
        preusuel[1],
        sexe
)
SELECT
    annais,
    MAX(CASE WHEN sexe = 1 THEN preusuel END) AS maculin,
    MAX(CASE WHEN sexe = 2 THEN preusuel END) AS feminin
FROM
    ranked_prenoms
WHERE
    rn = 1
GROUP BY
    annais
ORDER BY
    annais;

4.7 Par départements

CREATE OR REPLACE VIEW prenom_dep_raw AS FROM 'https://www.insee.fr/fr/statistiques/fichier/8205621/prenoms-2023-dpt.parquet';

SELECT *
  FROM prenom_dep_raw;

CREATE OR REPLACE VIEW prenom_dep
AS
SELECT sexe, 
       STRIP_ACCENTS(preusuel) AS preusuel, 
       annais, 
       dpt,
       nombre 
  FROM prenom_dep_raw
 WHERE annais <> 'XXXX'
   AND dpt <> 'XX'
   AND preusuel <> '_PRENOMS_RARES';