R软件与多股票波动性及相关性的可视化

前言

在量化投资建模过程之前,有时候,我们需要对多只股票的价格走势、收益率序列、波动率等进行分析。下面给出使用 R 语言比较多只股票价格走势的完整解决方案。方案涵盖数据获取、清洗、可视化及基础分析全流程:

数据获取

安装与加载工具包

# 安装必要包(首次运行需取消注释)
# install.packages(c("quantmod", 
#                    "tidyverse", 
#                    "ggplot2", 
#                    "zoo", 
#                    "corrplot"))

library(quantmod)   # 获取金融数据
library(tidyverse)  # 数据处理
library(ggplot2)    # 可视化
library(zoo)        # 时间序列处理

定义股票代码与时间范围

# 股票代码列表(支持多市场,如A股需加 .SS/.SZ)
# 苹果、谷歌、微软、英伟达
stocks <- c("AAPL", "GOOGL", "MSFT", "NVDA")  
# 时间范围
start_date <- "2023-01-01"
end_date <- Sys.Date()  # 获取当前日期

批量获取股票数据

# 获取数据
getSymbols(stocks, 
           src = "yahoo", 
           from = start_date, 
           to = end_date)
## [1] "AAPL"  "GOOGL" "MSFT"  "NVDA"
# 处理数据
stock_data <- lapply(stocks, function(x) {
  data <- as_tibble(get(x)) %>%
    mutate(Date = index(get(x))) %>%
    rename_with(~ gsub(paste0("^", x, "\\."), "", .x)) %>%
    select(Date, Close) %>%
    mutate(symbol = x) %>%  # 添加股票代码列
    rename(price = Close)   # 重命名收盘价列
}) %>%
  bind_rows()

# 查看结果
head(stock_data)
## # A tibble: 6 × 3
##   Date       price symbol
##   <date>     <dbl> <chr> 
## 1 2023-01-03  125. AAPL  
## 2 2023-01-04  126. AAPL  
## 3 2023-01-05  125. AAPL  
## 4 2023-01-06  130. AAPL  
## 5 2023-01-09  130. AAPL  
## 6 2023-01-10  131. AAPL

数据清洗

处理缺失值

library(dplyr)
# 检查缺失值
missing_values <- stock_data %>%
  group_by(symbol) %>%
  summarise(missing = sum(is.na(price)))

# 填充缺失值(使用前向填充)
stock_data <- stock_data %>%
  group_by(symbol) %>%
  mutate(price = na.locf(price))

对齐时间序列

library(dplyr)
# 生成完整日期序列
full_dates <- tibble(Date = seq(as.Date(start_date), 
                                as.Date(end_date), 
                                by = "day"))

# 左连接填充所有日期
stock_data <- full_dates %>%
  left_join(stock_data, by = "Date") %>%
  group_by(symbol) %>%
  fill(price, .direction = "downup") %>%
  na.omit()

价格走势可视化

基础折线图

library(dplyr)
ggplot(stock_data, aes(x = Date, y = price, color = symbol)) +
  geom_line(linewidth = 0.8) +
  labs(title = "多只股票价格走势对比",
       x = "日期",
       y = "收盘价",
       color = "股票代码") +
  theme_minimal() +
  theme(legend.position = "top") +  
  scale_color_manual(values = c("AAPL" = "red", 
                                "GOOGL" = "blue", 
                                "MSFT" = "green", 
                                "NVDA" = "purple")
                     )

对数收益率对比

library(dplyr)
# 计算对数收益率
return_data <- stock_data %>%
  group_by(symbol) %>%
  mutate(log_return = log(price) - log(lag(price))) %>%
  na.omit()

# 绘制收益率曲线
ggplot(return_data, 
       aes(x = Date, y = log_return, color = symbol)) +
  geom_line(alpha = 0.7) +
  labs(title = "对数收益率对比",
       x = "日期",
       y = "对数收益率",
       color = "股票代码") +
  theme_minimal() + 
  theme(legend.position = "top") # 图例放底部

绘制对数收益率密度图:

library(dplyr)
ggplot(return_data, aes(x = log_return, fill = symbol)) +
  geom_density(alpha = 0.4) +  # 半透明填充
  facet_wrap(~ symbol, ncol = 2) +  # 按股票分面显示
  labs(title = "对数收益率密度分布对比",
       x = "对数收益率",
       y = "密度") +
  theme_minimal() +
  theme(legend.position = "top")  # 图例放底部

将密度图叠加以便于比较:

library(dplyr)
# 对数收益率密度图(叠加显示)
ggplot(return_data, aes(x = log_return, fill = symbol, color = symbol)) +
  geom_density(alpha = 0.3, linewidth = 1) +  
  scale_fill_manual(values = c("AAPL" = "#FF5252", 
                               "GOOGL" = "#4285F4", 
                               "MSFT" = "#00A4EF", 
                               "NVDA" = "#7FBA00")) +
  scale_color_manual(values = c("AAPL" = "#D50000", 
                                "GOOGL" = "#0D47A1", 
                                "MSFT" = "#005A8E", 
                                "NVDA" = "#527D00")) 
  labs(title = "对数收益率密度分布对比",
       x = "对数收益率",
       y = "密度",
       fill = "股票代码",
       color = "股票代码") +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.box = "horizontal",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )
## NULL

还可以绘制箱线图:

library(dplyr)
# 箱线图对比
ggplot(return_data, aes(x = symbol, y = log_return, fill = symbol)) +
  geom_boxplot() +
  labs(title = "对数收益率箱线图对比",
       x = "股票代码",
       y = "对数收益率") +
  theme_minimal() +
  theme(legend.position = "top") 

股票数据特征的统计分析

计算波动率

library(dplyr)
volatility <- return_data %>%
  group_by(symbol) %>%
  summarise(volatility = sd(log_return, na.rm = TRUE)) %>%
  arrange(desc(volatility))

print(volatility)
## # A tibble: 4 × 2
##   symbol volatility
##   <chr>       <dbl>
## 1 NVDA       0.0330
## 2 GOOGL      0.0193
## 3 AAPL       0.0165
## 4 MSFT       0.0152

相关性分析

library(dplyr)
# 转换为宽格式
price_wide <- return_data %>%
  select(Date, symbol, price) %>%
  pivot_wider(names_from = symbol, values_from = price) %>%
  column_to_rownames(var = "Date")

# 计算相关系数矩阵
cor_matrix <- cor(price_wide)

# 可视化相关系数
library(corrplot)

# 绘制相关性矩阵(暖色调)
corrplot(cor_matrix, 
         method = "color",      # 颜色填充
         type = "upper",        # 只显示上三角
         tl.col = "black",      # 标签颜色
         tl.srt = 45,           # 标签倾斜角度
         title = "股票价格相关性矩阵", 
         mar = c(0,0,1,0),      # 边距调整
         addCoef.col = "black", # 添加相关系数数值
         number.cex = 0.7,      # 系数文字大小
         diag = FALSE)          # 不显示对角线
# 计算相关系数矩阵
cor_matrix <- cor(price_wide)

# 使用ggcorrplot绘制ggplot2风格的相关性矩阵(暖色调)
library(ggcorrplot)

ggcorrplot(
  cor_matrix,
  method = "square",          # 颜色填充
  type = "upper",            # 只显示上三角
  colors = c("#FF4500", "#FFFFFF", "#1E90FF"),  # 自定义颜色(红-白-蓝)
  lab = TRUE,                # 显示相关系数
  lab_size = 3.5,            # 系数文字大小
  title = "股票价格相关性矩阵",
  ggtheme = theme_minimal(), # ggplot2主题
  show.legend = TRUE,        # 显示图例
  legend.title = "相关性",
  tl.col = "black",          # 标签颜色
  tl.srt = 45,               # 标签倾斜角度
  digits = 2                 # 保留两位小数
) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text = element_text(size = 10),
    legend.text = element_text(size = 9),
    legend.title = element_text(size = 10, face = "bold")
  )

导出数据

# 导出为 CSV
# write_csv(stock_data, "stock_prices.csv")

# 导出为 Excel(需安装 writexl 包)
# install.packages("writexl")
# write_xlsx(stock_data, "stock_prices.xlsx")

小结

本文的数据来源为雅虎财经(Yahoo Finance),若需更专业数据,可考虑 WRDS 数据库(需机构订阅)。

在 R 软件包的选择上,我们使用了 quantmod 包以快速获取数据,但该软件包返回的是 xts 格式,后续计算过程中需转换为 tibble 。

数据处理过程借助于 tidyquant 包,该软件包可以返回整洁格式的数据,与 tidyverse 兼容性更好。

缺失值处理方面,前向填充(na.locf)适用于短期缺失,多重插补(mice包)可处理复杂缺失模式。可视化优化方面,可以使用scale_color_manual自定义颜色。此外,可以添加geom_smooth拟合趋势线(如method = “loess”)。

通过以上步骤,我们可以高效地获取、清洗并可视化多只股票的价格走势,结合波动率和相关性分析,为投资决策提供数据支持。