[3.13] gh-121130: Fix f-string format specifiers with debug expressions (GH-121150) (#121868)

(cherry picked from commit c46d64e0ef)
Pablo Galindo Salgado, 2024-07-16 20:25:04 +01:00, committed by GitHub
parent 06d76c4b94
commit d24ec8a47c
8 changed files with 4044 additions and 1023 deletions
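
A minimal sketch of the behavior this fix enables (illustrative only, assuming a build that includes the change): the debug expression {y=} inside a nested replacement field expands to the text "y=20", and that text is then used verbatim as the format specifier.

    # Illustrative sketch, not part of the diff.
    y = 20
    # The expanded spec "y=20" is read as fill 'y', '=' alignment, width 20,
    # so 2 is padded with 'y' characters to a total width of 20.
    assert f"{2:{y=}}" == format(2, "y=20") == "y" * 19 + "2"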

View File

@@ -316,9 +316,7 @@ Literals
args=[
Name(id='a', ctx=Load())]),
conversion=-1,
format_spec=JoinedStr(
values=[
Constant(value='.3')]))]))
format_spec=Constant(value='.3'))]))
.. class:: List(elts, ctx)
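
The documentation change above reflects that a purely literal format spec is now represented directly as a Constant node instead of a JoinedStr wrapping a single Constant. A quick way to see this (illustrative, assuming a build with this fix):

    import ast

    # format_spec of the single FormattedValue in f'{a:.3}'
    spec = ast.parse("f'{a:.3}'", mode="eval").body.values[0].format_spec
    print(ast.dump(spec))  # expected: Constant(value='.3') rather than a JoinedStr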

File diff suppressed because it is too large

View File

@@ -8,6 +8,7 @@
# Unicode identifiers in tests is allowed by PEP 3131.
import ast
import datetime
import dis
import os
import re
@@ -1602,6 +1603,12 @@ x = (
self.assertEqual(f'{f(a=4)}', '3=')
self.assertEqual(x, 4)
# Check debug expressions in format spec
y = 20
self.assertEqual(f"{2:{y=}}", "yyyyyyyyyyyyyyyyyyy2")
self.assertEqual(f"{datetime.datetime.now():h1{y=}h2{y=}h3{y=}}",
'h1y=20h2y=20h3y=20')
# Make sure __format__ is being called.
class C:
def __format__(self, s):
@@ -1615,9 +1622,11 @@ x = (
self.assertEqual(f'{C()=: }', 'C()=FORMAT- ')
self.assertEqual(f'{C()=:x}', 'C()=FORMAT-x')
self.assertEqual(f'{C()=!r:*^20}', 'C()=********REPR********')
self.assertEqual(f"{C():{20=}}", 'FORMAT-20=20')
self.assertRaises(SyntaxError, eval, "f'{C=]'")
# Make sure leading and following text works.
x = 'foo'
self.assertEqual(f'X{x=}Y', 'Xx='+repr(x)+'Y')
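
The new C.__format__ tests above check that the expanded debug text is handed verbatim to __format__ as the spec string. A small sketch of the same idea (names are illustrative, not taken from the diff):

    class Spec:
        def __format__(self, spec):
            # Receives whatever text the format spec expanded to.
            return f"spec={spec!r}"

    width = 8
    print(f"{Spec():{width=}}")  # expected output: spec='width=8'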

View File

@@ -0,0 +1,2 @@
Fix f-strings with debug expressions in format specifiers. Patch by Pablo
Galindo

View File

@@ -961,6 +961,8 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
return result_token_with_metadata(p, conv, conv_token->metadata);
}
static asdl_expr_seq *
unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions);
ResultTokenWithMetadata *
_PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset,
int end_lineno, int end_col_offset, PyArena *arena)
@@ -999,8 +1001,15 @@ _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, in
assert(j == non_empty_count);
spec = resized_spec;
}
expr_ty res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
end_col_offset, p->arena);
expr_ty res;
if (asdl_seq_LEN(spec) == 0) {
res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
end_col_offset, p->arena);
} else {
res = _PyPegen_concatenate_strings(p, spec,
lineno, col_offset, end_lineno,
end_col_offset, arena);
}
if (!res) {
return NULL;
}
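
With this change a non-empty format spec is routed through _PyPegen_concatenate_strings, so a spec that mixes literal text with a debug expression ends up as a JoinedStr whose values interleave Constant and FormattedValue nodes (or collapses to a single Constant when it is all literal text). A rough way to inspect the result (illustrative, assuming a build with this fix):

    import ast

    spec = ast.parse('f"{x:h1{y=}h2}"', mode="eval").body.values[0].format_spec
    # Typically shows Constant text pieces around a FormattedValue for y.
    print(ast.dump(spec, indent=2))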
@@ -1300,6 +1309,7 @@ unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions)
expr_ty
_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) {
asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions);
Py_ssize_t n_items = asdl_seq_LEN(expr);
@@ -1464,7 +1474,6 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, Re
debug_end_offset = end_col_offset;
debug_metadata = closing_brace->metadata;
}
expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line,
debug_end_offset - 1, p->arena);
if (!debug_text) {
@@ -1497,16 +1506,23 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
Py_ssize_t n_flattened_elements = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
if (elem->kind == Constant_kind) {
if (PyBytes_CheckExact(elem->v.Constant.value)) {
bytes_found = 1;
} else {
unicode_string_found = 1;
}
n_flattened_elements++;
} else {
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
f_string_found = 1;
switch(elem->kind) {
case Constant_kind:
if (PyBytes_CheckExact(elem->v.Constant.value)) {
bytes_found = 1;
} else {
unicode_string_found = 1;
}
n_flattened_elements++;
break;
case JoinedStr_kind:
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
f_string_found = 1;
break;
default:
n_flattened_elements++;
f_string_found = 1;
break;
}
}
@@ -1548,16 +1564,19 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
Py_ssize_t j = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
if (elem->kind == Constant_kind) {
asdl_seq_SET(flattened, current_pos++, elem);
} else {
for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
if (subvalue == NULL) {
return NULL;
switch(elem->kind) {
case JoinedStr_kind:
for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
if (subvalue == NULL) {
return NULL;
}
asdl_seq_SET(flattened, current_pos++, subvalue);
}
asdl_seq_SET(flattened, current_pos++, subvalue);
}
break;
default:
asdl_seq_SET(flattened, current_pos++, elem);
break;
}
}
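
_PyPegen_concatenate_strings is the same helper used for implicit string concatenation, which is why it now has to tolerate elements that are neither Constant nor JoinedStr (for example, a bare FormattedValue coming from a debug expression in a format spec). Its observable effect from Python (illustrative only):

    import ast

    # Implicit concatenation of a plain literal and an f-string yields one JoinedStr;
    # concatenating only plain literals collapses to a single Constant.
    print(ast.dump(ast.parse('"a" f"{b}" "c"', mode="eval").body))
    print(ast.dump(ast.parse('"a" "c"', mode="eval").body))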

View File

@@ -989,6 +989,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
the_current_tok->last_expr_buffer = NULL;
the_current_tok->last_expr_size = 0;
the_current_tok->last_expr_end = -1;
the_current_tok->in_format_spec = 0;
the_current_tok->f_string_debug = 0;
switch (*tok->start) {
@@ -1137,15 +1138,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
* by the `{` case, so for ensuring that we are on the 0th level, we need
* to adjust it manually */
int cursor = current_tok->curly_bracket_depth - (c != '{');
if (cursor == 0 && !_PyLexer_update_fstring_expr(tok, c)) {
int in_format_spec = current_tok->in_format_spec;
int cursor_in_format_with_debug =
cursor == 1 && (current_tok->f_string_debug || in_format_spec);
int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
if ((cursor_valid) && !_PyLexer_update_fstring_expr(tok, c)) {
return MAKE_TOKEN(ENDMARKER);
}
if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
if ((cursor_valid) && c != '{' && set_fstring_expr(tok, token, c)) {
return MAKE_TOKEN(ERRORTOKEN);
}
if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
current_tok->kind = TOK_FSTRING_MODE;
current_tok->in_format_spec = 1;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(_PyToken_OneChar(c));
@@ -1235,6 +1241,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
current_tok->curly_bracket_expr_start_depth--;
current_tok->kind = TOK_FSTRING_MODE;
current_tok->in_format_spec = 0;
current_tok->f_string_debug = 0;
}
}
@@ -1317,11 +1324,11 @@ f_string_middle:
tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {
if (tok->done == E_ERROR || tok->done == E_DECODE) {
return MAKE_TOKEN(ERRORTOKEN);
}
int in_format_spec = (
current_tok->last_expr_end != -1
current_tok->in_format_spec
&&
INSIDE_FSTRING_EXPR(current_tok)
);
@@ -1337,6 +1344,7 @@ f_string_middle:
if (in_format_spec && c == '\n') {
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(FSTRING_MIDDLE);
@@ -1378,6 +1386,9 @@ f_string_middle:
}
if (c == '{') {
if (!_PyLexer_update_fstring_expr(tok, c)) {
return MAKE_TOKEN(ENDMARKER);
}
int peek = tok_nextc(tok);
if (peek != '{' || in_format_spec) {
tok_backup(tok, peek);
@@ -1387,6 +1398,7 @@ f_string_middle:
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
}
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
} else {
@@ -1406,13 +1418,15 @@ f_string_middle:
// scanning (indicated by the end of the expression being set) and we are not at the top level
// of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
// brackets, we can bypass it here.
if (peek == '}' && !in_format_spec) {
int cursor = current_tok->curly_bracket_depth;
if (peek == '}' && !in_format_spec && cursor == 0) {
p_start = tok->start;
p_end = tok->cur - 1;
} else {
tok_backup(tok, peek);
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
}
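
The lexer changes above make the tokenizer track whether it is inside a format spec (the new in_format_spec flag) so that braces and the '=' of a debug expression are handled correctly there. One way to poke at the result from Python (illustrative only; the exact token stream is version-dependent):

    import io
    import tokenize

    src = 'f"{value:{width=}}"\n'
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))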

View File

@@ -74,6 +74,7 @@ free_fstring_expressions(struct tok_state *tok)
mode->last_expr_buffer = NULL;
mode->last_expr_size = 0;
mode->last_expr_end = -1;
mode->in_format_spec = 0;
}
}
}

View File

@@ -58,6 +58,7 @@ typedef struct _tokenizer_mode {
Py_ssize_t last_expr_end;
char* last_expr_buffer;
int f_string_debug;
int in_format_spec;
} tokenizer_mode;
/* Tokenizer state */