跳到主要内容

JSON 解析完全指南

Spark SQL 中处理 JSON 数据的所有姿势


SQL 专题版(原理讲解)

一、基础取值

get_json_object —— 取单个字段

-- 语法:get_json_object(json_string, path)
-- path 以 $ 开头,用 . 访问对象,用 [n] 访问数组

SELECT get_json_object('{"name":"张三","age":28}', '$.name');
-- 结果:张三

SELECT get_json_object('{"name":"张三","age":28}', '$.age');
-- 结果:28(注意:返回类型始终是 STRING)

注意事项

  • 返回值永远是 STRING,需要用 CAST 转类型
  • 每次只能取一个字段,取多个字段需调用多次(性能差)
  • path 不存在时返回 NULL

json_tuple —— 一次取多个字段

-- 一次解析多个字段,比多次 get_json_object 性能好
SELECT
j.name,
j.age,
j.city
FROM source_table
LATERAL VIEW json_tuple(json_col, 'name', 'age', 'city') j AS name, age, city;

vs get_json_object

  • json_tuple 只解析一次 JSON,性能更优
  • 但 json_tuple 只能取第一层字段,不支持嵌套路径
  • 需要配合 LATERAL VIEW 使用

from_json —— 转结构体(最强大)

-- 定义 schema,将 JSON 字符串转为结构体
SELECT
from_json(
'{"name":"张三","age":28,"scores":[90,85,92]}',
'struct<name:string, age:int, scores:array<int>>'
) AS parsed;

-- 转为结构体后可以直接用 . 取字段
SELECT
parsed.name,
parsed.age,
parsed.scores[0]
FROM (
SELECT from_json(json_col, 'struct<name:string, age:int, scores:array<int>>') AS parsed
FROM source_table
) t;

优势:类型安全,后续操作直接用结构体语法,无需反复解析。


二、数组处理

explode —— 展开 JSON 数组

-- 场景:一行包含多个元素的 JSON 数组,需要展开为多行

-- 原始数据:{"user":"张三","tags":["大数据","SQL","Python"]}

-- Step 1:取出数组字符串
-- Step 2:转为 array 类型
-- Step 3:explode 展开

SELECT
get_json_object(json_col, '$.user') AS user,
tag
FROM source_table
LATERAL VIEW explode(
from_json(get_json_object(json_col, '$.tags'), 'array<string>')
) t AS tag;

-- 结果:
-- 张三 大数据
-- 张三 SQL
-- 张三 Python

posexplode —— 带索引展开

-- 保留元素在数组中的位置
SELECT
user_id,
pos, -- 索引,从 0 开始
item
FROM source_table
LATERAL VIEW posexplode(
from_json(get_json_object(json_col, '$.items'), 'array<string>')
) t AS pos, item;

数组内取指定元素

-- 取数组第 N 个元素(0-based)
SELECT get_json_object(json_col, '$.scores[0]') AS first_score;
SELECT get_json_object(json_col, '$.scores[1]') AS second_score;

-- 取数组长度
SELECT size(from_json(get_json_object(json_col, '$.scores'), 'array<int>')) AS arr_len;

-- 取数组最后一个元素
SELECT
scores[size(scores) - 1] AS last_score
FROM (
SELECT from_json(get_json_object(json_col, '$.scores'), 'array<int>') AS scores
FROM source_table
) t;

三、嵌套解析

多层嵌套对象

-- 数据:{"user":{"name":"张三","address":{"city":"北京","district":"海淀"}}}

-- 方式一:get_json_object 直接用路径
SELECT
get_json_object(json_col, '$.user.name') AS name,
get_json_object(json_col, '$.user.address.city') AS city,
get_json_object(json_col, '$.user.address.district') AS district;

-- 方式二:from_json 定义完整 schema(推荐)
SELECT
parsed.user.name,
parsed.user.address.city,
parsed.user.address.district
FROM (
SELECT from_json(json_col,
'struct<user:struct<name:string, address:struct<city:string, district:string>>>'
) AS parsed
FROM source_table
) t;

数组嵌套对象

-- 数据:{"orders":[{"id":1,"amount":100},{"id":2,"amount":200}]}

SELECT
order_item.id,
order_item.amount
FROM source_table
LATERAL VIEW explode(
from_json(
get_json_object(json_col, '$.orders'),
'array<struct<id:int, amount:double>>'
)
) t AS order_item;

-- 结果:
-- 1 100.0
-- 2 200.0

对象嵌套数组

-- 数据:{"user":"张三","courses":[{"name":"数学","scores":[90,85,88]},{"name":"语文","scores":[78,82,80]}]}

SELECT
get_json_object(json_col, '$.user') AS user,
course.name AS course_name,
score
FROM source_table
LATERAL VIEW explode(
from_json(
get_json_object(json_col, '$.courses'),
'array<struct<name:string, scores:array<int>>>'
)
) t1 AS course
LATERAL VIEW explode(course.scores) t2 AS score;

-- 结果:
-- 张三 数学 90
-- 张三 数学 85
-- 张三 数学 88
-- 张三 语文 78
-- ...

四、Map 操作

str_to_map —— 字符串转 Map

-- 场景:KV 格式字符串 "k1=v1,k2=v2,k3=v3"
SELECT
kv_map['k1'] AS v1,
kv_map['k2'] AS v2
FROM (
SELECT str_to_map('k1=v1,k2=v2,k3=v3', ',', '=') AS kv_map
) t;

Map 常用操作

-- map_keys:获取所有 key
SELECT map_keys(str_to_map('a=1,b=2,c=3', ',', '='));
-- 结果:["a","b","c"]

-- map_values:获取所有 value
SELECT map_values(str_to_map('a=1,b=2,c=3', ',', '='));
-- 结果:["1","2","3"]

-- element_at:按 key 取值(比 [] 更安全,key 不存在返回 NULL)
SELECT element_at(map_col, 'target_key');

-- map_from_arrays:两个数组组合为 Map
SELECT map_from_arrays(array('a','b','c'), array(1,2,3));

-- explode Map 为多行
SELECT key, value
FROM source_table
LATERAL VIEW explode(str_to_map(kv_col, ',', '=')) t AS key, value;

五、结构体操作

-- named_struct:构造结构体
SELECT named_struct('name', '张三', 'age', 28, 'city', '北京') AS info;

-- 结构体取字段
SELECT info.name, info.age FROM (
SELECT named_struct('name', '张三', 'age', 28) AS info
) t;

-- 结构体数组
SELECT
collect_list(named_struct('id', id, 'name', name)) AS user_list
FROM users
GROUP BY dept;

六、Schema 推断

-- schema_of_json:自动推断 JSON 的 schema(开发调试用)
SELECT schema_of_json('{"name":"张三","age":28,"scores":[90,85]}');
-- 结果:struct<name:string,age:bigint,scores:array<bigint>>

-- 用推断结果直接传给 from_json
SELECT from_json(json_col, schema_of_json('{"name":"张三","age":28}'))
FROM source_table;

注意:schema_of_json 基于单条样本推断,生产环境建议手动定义 schema 以确保类型准确。


七、异常处理

JSON 格式不合法

-- get_json_object 对非法 JSON 返回 NULL
SELECT get_json_object('not a json', '$.name');
-- 结果:NULL

-- 过滤非法 JSON 行
SELECT *
FROM source_table
WHERE get_json_object(json_col, '$') IS NOT NULL;

-- 用 try_to_number 等 try_ 系列函数安全转换
SELECT
CAST(get_json_object(json_col, '$.age') AS INT) -- 非数字会报错
-- vs
TRY_CAST(get_json_object(json_col, '$.age') AS INT) -- 非数字返回 NULL(Spark 3.4+)

字段缺失

-- 字段不存在时返回 NULL,用 COALESCE 给默认值
SELECT
COALESCE(get_json_object(json_col, '$.nickname'), '未知') AS nickname;

类型不匹配

-- from_json schema 与实际数据不匹配时,整个结构体返回 NULL
-- 建议:先用 schema_of_json 确认实际类型,再定义 schema

-- 常见坑:JSON 中 age 有时是 "28"(字符串),有时是 28(数字)
-- 解决:schema 中用 string 接收,后续 CAST
SELECT CAST(parsed.age AS INT)
FROM (
SELECT from_json(json_col, 'struct<age:string>') AS parsed
FROM source_table
) t;

八、JSON 写出

-- to_json:结构体/Map 转 JSON 字符串
SELECT to_json(named_struct('name', '张三', 'age', 28));
-- 结果:{"name":"张三","age":28}

-- 数组转 JSON
SELECT to_json(array(1, 2, 3));
-- 结果:[1,2,3]

-- Map 转 JSON
SELECT to_json(map('name', '张三', 'city', '北京'));
-- 结果:{"name":"张三","city":"北京"}

-- 聚合后输出 JSON 数组
SELECT
dept,
to_json(collect_list(named_struct('name', name, 'salary', salary))) AS employees_json
FROM employees
GROUP BY dept;

九、实战场景

场景一:埋点日志解析

-- 原始日志:{"event":"click","ts":1700000000,"props":{"page":"home","button_id":"buy_btn","ext":{"ab_test":"v2"}}}

SELECT
get_json_object(log, '$.event') AS event_name,
from_unixtime(CAST(get_json_object(log, '$.ts') AS BIGINT)) AS event_time,
get_json_object(log, '$.props.page') AS page,
get_json_object(log, '$.props.button_id') AS button_id,
get_json_object(log, '$.props.ext.ab_test') AS ab_test
FROM ods_event_log
WHERE dt = '${bizdate}';

场景二:接口返回值解析

-- API 响应:{"code":200,"data":{"list":[{"id":1,"name":"商品A"},{"id":2,"name":"商品B"}],"total":100}}

SELECT
get_json_object(response, '$.code') AS code,
get_json_object(response, '$.data.total') AS total,
item.id,
item.name
FROM api_response_table
LATERAL VIEW explode(
from_json(
get_json_object(response, '$.data.list'),
'array<struct<id:int, name:string>>'
)
) t AS item
WHERE get_json_object(response, '$.code') = '200';

场景三:动态 KV 属性解析

-- 用户属性存储为动态 JSON:{"vip_level":"3","register_channel":"wechat","preferences":"sports,tech"}

-- 方式一:已知要取哪些 key
SELECT
user_id,
get_json_object(attrs, '$.vip_level') AS vip_level,
get_json_object(attrs, '$.register_channel') AS channel
FROM user_profile;

-- 方式二:将动态 KV 全部展开为行
SELECT
user_id,
attr_key,
attr_value
FROM user_profile
LATERAL VIEW explode(
from_json(attrs, 'map<string,string>')
) t AS attr_key, attr_value;

实用工具版(速查模板)

快速对照表

需求用什么示例
取单个字段get_json_object(col, '$.key')get_json_object(j, '$.name')
取多个一级字段json_tuple + LATERAL VIEW见下方
取嵌套字段get_json_object(col, '$.a.b.c')get_json_object(j, '$.user.addr.city')
JSON→结构体from_json(col, schema)见下方
展开 JSON 数组explode(from_json(..., 'array<...>'))见下方
带索引展开posexplode(...)posexplode(arr) AS pos, item
取数组第N个get_json_object(col, '$.arr[0]')0-based 索引
KV 字符串→Mapstr_to_map(col, ',', '=')str_to_map('a=1,b=2',',','=')
Map 取值element_at(map, key)map['key']
结构体→JSONto_json(struct)
推断 Schemaschema_of_json(sample)开发调试用

常用模板

-- ① 取单个嵌套字段
get_json_object(json_col, '$.level1.level2.field')

-- ② 一次取多个一级字段
SELECT j.*
FROM table LATERAL VIEW json_tuple(json_col, 'f1', 'f2', 'f3') j AS f1, f2, f3;

-- ③ 展开 JSON 对象数组
SELECT item.field1, item.field2
FROM table
LATERAL VIEW explode(from_json(get_json_object(json_col, '$.array_field'), 'array<struct<field1:type1, field2:type2>>')) t AS item;

-- ④ 两层展开(数组套数组)
SELECT outer_item.name, inner_val
FROM table
LATERAL VIEW explode(from_json(get_json_object(json_col, '$.outer'), 'array<struct<name:string, inner:array<int>>>')) t1 AS outer_item
LATERAL VIEW explode(outer_item.inner) t2 AS inner_val;

-- ⑤ 动态 KV 展开为行
SELECT key, value
FROM table
LATERAL VIEW explode(from_json(json_col, 'map<string,string>')) t AS key, value;

-- ⑥ JSON 格式校验(过滤脏数据)
WHERE get_json_object(json_col, '$') IS NOT NULL

-- ⑦ 安全取值 + 默认值
COALESCE(get_json_object(json_col, '$.field'), '默认值')

-- ⑧ 取数组长度
size(from_json(get_json_object(json_col, '$.arr'), 'array<string>'))