采集社交媒体热点话题，统计关键词出现频次，帮助普通人了解当前流行啥。-程序员充电站

我将为您创建一个"社交媒体热点分析器"，帮助用户了解当前流行趋势。这个程序将模拟采集和分析社交媒体数据的功能。

项目结构

social_media_analyzer/

├── main.py

├── data_collector.py

├── analyzer.py

├── visualizer.py

├── config.py

├── utils.py

├── data/

│ ├── sample_posts.json

│ ├── keywords_dict.json

│ └── hot_topics.json

├── requirements.txt

└── README.md

1. 配置文件 (config.py)

# -*- coding: utf-8 -*-

"""

社交媒体热点分析器配置文件

包含程序运行的各种配置参数

"""

import os

# Base directories: every path below is resolved relative to this module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')

# Locations of the JSON data files inside DATA_DIR.
SAMPLE_POSTS_FILE, KEYWORDS_DICT_FILE, HOT_TOPICS_FILE = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ('sample_posts.json', 'keywords_dict.json', 'hot_topics.json')
)

# Simulated data sources: platform id -> display name.
DATA_SOURCES = dict(
    weibo='微博',
    douyin='抖音',
    xiaohongshu='小红书',
    zhihu='知乎',
    bilibili='B站',
)

# Analysis settings.
ANALYSIS_CONFIG = dict(
    min_word_length=2,        # minimum token length
    max_hot_keywords=20,      # maximum number of hot keywords
    time_window_hours=24,     # time window, in hours
    sentiment_analysis=True,  # whether to run sentiment analysis
    trend_detection=True,     # whether to run trend detection
    min_frequency=2,          # minimum number of occurrences
    exclude_stopwords=True,   # whether to drop stop words
)

# Keywords associated with each content category.
CATEGORY_KEYWORDS = {
    'technology': [
        'AI', '人工智能', '科技', '数码', '手机', '电脑', '软件', '编程', '互联网',
    ],
    'entertainment': [
        '电影', '音乐', '明星', '综艺', '电视剧', '游戏', '动漫', '娱乐', '八卦',
    ],
    'lifestyle': [
        '美食', '旅游', '时尚', '健身', '购物', '穿搭', '护肤', '生活', '日常',
    ],
    'education': [
        '学习', '考试', '教育', '知识', '技能', '培训', '学校', '老师', '学生',
    ],
    'finance': [
        '股票', '基金', '理财', '投资', '经济', '房价', '工资', '消费', '赚钱',
    ],
    'sports': [
        '足球', '篮球', '运动', '健身', '比赛', '奥运', '冠军', '体育', '健康',
    ],
}

# Sentiment lexicon (simplified).
SENTIMENT_WORDS = {
    'positive': [
        '喜欢', '爱', '棒', '好', '赞', '支持', '开心', '快乐',
        '美丽', '帅气', '厉害', '牛', '666', '优秀',
    ],
    'negative': [
        '讨厌', '恨', '差', '坏', '垃圾', '失望', '生气', '愤怒',
        '丑陋', '无聊', '糟糕', '坑', '骗',
    ],
}

# Stop-word set (simplified).
STOP_WORDS = {
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
    '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
    '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '他',
    '她', '它', '我们', '你们', '他们', '个', '些', '么', '吗', '呢',
    '吧', '啊', '哈', '嘿', '哦', '嗯', '呗', '啦', '呀', '哟',
    '喔', '呵', '嘻', '哈哈', '呵呵', '嘿嘿',
}

2. 工具函数 (utils.py)

# -*- coding: utf-8 -*-

"""

工具函数模块

提供各种辅助功能

"""

import json

import re

import time

import random

from datetime import datetime, timedelta

from collections import Counter, defaultdict

from typing import List, Dict, Any, Set, Tuple

import jieba # 中文分词库

class DataUtils:
    """Stateless helpers for JSON I/O, text cleaning, tokenising and timestamps.

    All helpers are static methods, so they can be called on the class or on
    an instance.
    """

    @staticmethod
    def load_json_file(file_path: str) -> Dict[str, Any]:
        """Load a JSON file.

        Args:
            file_path: Path of the JSON file.

        Returns:
            The parsed data, or an empty dict when the file is missing or
            does not contain valid JSON (a message is printed in both cases).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"警告: 文件 {file_path} 未找到")
            return {}
        except json.JSONDecodeError:
            print(f"错误: 文件 {file_path} 格式不正确")
            return {}

    @staticmethod
    def save_json_file(data: Dict[str, Any], file_path: str) -> bool:
        """Write *data* to *file_path* as UTF-8 JSON.

        Args:
            data: The data to serialise.
            file_path: Destination path; missing parent directories are created.

        Returns:
            True on success, False if anything went wrong (the error is printed).
        """
        # Imported here because this module's import header does not import os;
        # without it every call failed with a swallowed NameError.
        import os

        try:
            parent_dir = os.path.dirname(file_path)
            # A bare file name has no directory part; os.makedirs('') would raise.
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            return True
        except Exception as e:
            # Best-effort boundary: report the failure instead of raising.
            print(f"保存文件失败: {e}")
            return False

    @staticmethod
    def clean_text(text: str) -> str:
        """Strip HTML tags, URLs, unsupported characters and extra whitespace.

        Args:
            text: Raw text.

        Returns:
            The cleaned text; an empty string for empty input.
        """
        if not text:
            return ""

        # Remove HTML tags.
        text = re.sub(r'<[^>]+>', '', text)
        # Remove URLs.
        text = re.sub(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            '',
            text,
        )
        # Keep only Chinese characters, ASCII letters, digits, whitespace,
        # basic punctuation, '#' and '@' (this drops emoji among others).
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s.,!?;:，。！？；：#@]', '', text)
        # Collapse runs of whitespace.
        return re.sub(r'\s+', ' ', text.strip())

    @staticmethod
    def extract_hashtags(text: str) -> List[str]:
        """Extract hashtag topics from *text*.

        Both the enclosed form ``#topic#`` and the open form ``#topic`` are
        recognised. The open form must start the text or follow whitespace,
        so something like ``C#语言`` is not treated as a hashtag.

        Args:
            text: Input text.

        Returns:
            Topics in order of appearance, without the ``#`` markers; topics
            shorter than two characters are dropped.
        """
        if not text:
            return []

        matches = re.findall(r'#([^#\s]+)#|(?<!\S)#([^#\s]+)', text)
        # Exactly one of the two groups is non-empty for each match.
        tags = (enclosed or opened for enclosed, opened in matches)
        return [tag.strip() for tag in tags if len(tag.strip()) > 1]

    @staticmethod
    def extract_mentions(text: str) -> List[str]:
        """Extract @-mentioned user names from *text*.

        Args:
            text: Input text.

        Returns:
            Mentioned names in order of appearance, without the ``@``.
        """
        if not text:
            return []
        return re.findall(r'@([^\s@]+)', text)

    @staticmethod
    def segment_chinese_text(text: str) -> List[str]:
        """Tokenise Chinese text with jieba and filter out noise tokens.

        Args:
            text: Text to tokenise.

        Returns:
            Tokens of at least two characters that are not stop words, not
            purely numeric and not a single ASCII letter.
        """
        if not text:
            return []

        # STOP_WORDS is a module-level name looked up at call time.
        return [
            word
            for word in jieba.lcut(text)
            if len(word) >= 2
            and word not in STOP_WORDS
            and not word.isdigit()
            and not re.match(r'^[a-zA-Z]$', word)
        ]

    @staticmethod
    def calculate_frequency(words: List[str]) -> Dict[str, int]:
        """Count how often each word occurs.

        Args:
            words: List of words.

        Returns:
            Mapping of word to occurrence count, in first-seen order.
        """
        return dict(Counter(words))

    @staticmethod
    def get_current_time() -> str:
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def generate_random_time(hours_ago: int = 24) -> str:
        """Return a random past timestamp (used for simulated data).

        Args:
            hours_ago: Upper bound, in whole hours, on how far back to go.

        Returns:
            A ``YYYY-MM-DD HH:MM:SS`` string between 0 and *hours_ago* whole
            hours before now.
        """
        offset_hours = random.randint(0, hours_ago)
        moment = datetime.now() - timedelta(hours=offset_hours)
        return moment.strftime("%Y-%m-%d %H:%M:%S")

# Import the stop-word set that DataUtils.segment_chinese_text filters with.
# The name is looked up when that method runs, so defining it after the class
# body is fine.
try:
    from config import STOP_WORDS
except ImportError:
    # config could not be imported: fall back to filtering no stop words.
    STOP_WORDS = set()

# Shared module-level DataUtils instance.
# NOTE(review): this instance is named `utils`, the same as this module, and
# it does not carry a STOP_WORDS attribute.
utils = DataUtils()

3. 数据模拟器 (data_collector.py)

# -*- coding: utf-8 -*-

"""

社交媒体数据采集器（模拟版）

模拟从各大社交平台采集数据

"""

import json
import random
import time
from collections import Counter
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .utils import DataUtils, utils

class SocialMediaSimulator:
    """Simulated social-media collector.

    Generates fake posts from per-category sentence templates and keyword
    lists, and derives simple hot-topic statistics from a list of posts.
    """

    def __init__(self, config_file: str = "data/sample_posts.json"):
        """Set up the simulator.

        Args:
            config_file: Path of the sample-data JSON file that is loaded
                here and written by ``save_sample_data``.
        """
        self.config_file = config_file
        self.sample_data = utils.load_json_file(config_file)
        self.data_sources = ['weibo', 'douyin', 'xiaohongshu', 'zhihu', 'bilibili']

        # Post templates per category; each '{}' is filled with a keyword
        # (the one two-placeholder template also takes a day number).
        self.topic_templates = {
            'technology': [
                "刚刚体验了最新的{}，真的太震撼了！",
                "{}发布新功能，这次更新怎么样？",
                "有人用过{}吗？求分享使用心得",
                "{}真的是未来趋势，大家怎么看？",
                "{}改变了我的生活方式，强烈推荐！",
            ],
            'entertainment': [
                "看了{}，剧情太精彩了！",
                "{}的新作品上线了，必须追！",
                "{}真的太搞笑了，笑得肚子疼",
                "{}的颜值巅峰，不接受反驳",
                "{}这首歌太好听了，单曲循环中",
            ],
            'lifestyle': [
                "{}真的太好吃了，绝绝子！",
                "{}旅行攻略来了，收藏起来慢慢看",
                "{}穿搭分享，今日份的精致",
                "{}让我发现了生活的美好",
                "{}种草清单，钱包要空了",
            ],
            'education': [
                "{}学习方法分享，效率提升100%",
                "{}考试攻略，学姐的经验之谈",
                "{}技能get，小白也能学会",
                "{}改变了我的学习方式",
                "{}学习打卡第{}天，坚持就是胜利",
            ],
            'finance': [
                "{}投资心得，新手必看",
                "{}市场分析，专家这样说",
                "{}理财小白入门指南",
                "{}让我实现了财务自由",
                "{}投资策略分享",
            ],
            'sports': [
                "{}比赛太精彩了！",
                "{}夺冠瞬间泪目了",
                "{}运动员的拼搏精神值得学习",
                "{}让我爱上了这项运动",
                "{}训练方法分享",
            ],
        }

        # Keyword pool per category.
        self.keywords_library = {
            'technology': [
                'AI人工智能', 'iPhone15', '华为Mate60', '小米14', 'ChatGPT',
                '自动驾驶', '元宇宙', '区块链', '5G网络', '云计算',
                '大数据', '物联网', '芯片', '新能源', '智能家居',
            ],
            'entertainment': [
                '流浪地球3', '热辣滚烫', '飞驰人生2', '周杰伦', '王一博',
                '赵丽颖', '易烊千玺', '王嘉尔', '蔡徐坤', '时代少年团',
                '原神', '王者荣耀', '和平精英', '甄嬛传', '狂飙',
            ],
            'lifestyle': [
                '淄博烧烤', '哈尔滨旅游', '三亚度假', '海底捞', '喜茶',
                '完美日记', '花西子', '优衣库', 'ZARA', '星巴克',
                '宜家家居', '戴森吹风机', 'SK-II', '兰蔻', '雅诗兰黛',
            ],
            'education': [
                '考研', '考公', '英语四六级', '教师资格证', 'CPA',
                '法考', '计算机二级', 'Python编程', '数据分析', '机器学习',
                '深度学习', '英语学习', '写作技巧', '演讲口才', '时间管理',
            ],
            'finance': [
                '比特币', '以太坊', '茅台股票', '特斯拉', '苹果股票',
                '基金定投', '房贷利率', '理财产品', '保险配置', '退休规划',
                '副业赚钱', '创业项目', '电商运营', '直播带货', '数字货币',
            ],
            'sports': [
                '世界杯', '奥运会', 'NBA', 'CBA', '中超联赛',
                '梅西', 'C罗', '谷爱凌', '苏炳添', '全红婵',
                '樊振东', '马龙', '张继科', '孙杨', '宁泽涛',
            ],
        }

    def generate_sample_post(self, category: str = None) -> Dict[str, Any]:
        """Generate one simulated post.

        Args:
            category: Category to generate for; a random one when None.

        Returns:
            A dict with id, platform, user, content, timestamp, likes,
            shares, comments, category and keywords.
        """
        if category is None:
            category = random.choice(list(self.topic_templates.keys()))

        template = random.choice(self.topic_templates[category])
        keyword = random.choice(self.keywords_library[category])

        # Fill the template; the check-in template takes a second
        # placeholder for the day number.
        if '{}' in template:
            if '学习打卡第{}天' in template:
                post_content = template.format(keyword, random.randint(1, 100))
            else:
                post_content = template.format(keyword)
        else:
            post_content = template

        # 30% chance: append hashtags for the keyword and the category.
        if random.random() < 0.3:
            hashtags = [f"#{keyword}", f"#{category}"]
            post_content += " " + " ".join(hashtags)

        # 20% chance: append an @-mention.
        if random.random() < 0.2:
            mentions = ["@小明同学", "@小红薯", "@科技达人", "@娱乐圈", "@生活家"]
            post_content += " " + random.choice(mentions)

        users = [
            "科技爱好者", "追星女孩", "美食达人", "学习博主", "投资小白",
            "运动健将", "旅行者", "时尚icon", "游戏玩家", "电影迷",
        ]

        return {
            'id': f"post_{int(time.time())}_{random.randint(1000, 9999)}",
            'platform': random.choice(self.data_sources),
            'user': random.choice(users),
            'content': post_content,
            'timestamp': utils.generate_random_time(24),
            'likes': random.randint(0, 10000),
            'shares': random.randint(0, 1000),
            'comments': random.randint(0, 500),
            'category': category,
            'keywords': [keyword],
        }

    def collect_data(self, num_posts: int = 100, categories: List[str] = None) -> List[Dict[str, Any]]:
        """Collect (generate) a batch of simulated posts.

        Args:
            num_posts: Number of posts to generate.
            categories: Categories to draw from; None or an empty list means
                all categories.

        Returns:
            The generated posts in random order.
        """
        # An empty list is treated like None; it used to divide by zero below.
        if not categories:
            categories = list(self.topic_templates.keys())

        posts = []
        posts_per_category = num_posts // len(categories)
        for category in categories:
            for _ in range(posts_per_category):
                posts.append(self.generate_sample_post(category))

        # Top up with posts from random categories when num_posts does not
        # divide evenly.
        for _ in range(num_posts - len(posts)):
            posts.append(self.generate_sample_post())

        random.shuffle(posts)
        return posts

    def save_sample_data(self, posts: List[Dict[str, Any]]) -> bool:
        """Save posts to the sample-data file.

        Args:
            posts: Posts to save.

        Returns:
            Whether the save succeeded.
        """
        data = {
            'collection_time': utils.get_current_time(),
            'total_posts': len(posts),
            'posts': posts,
        }
        return utils.save_json_file(data, self.config_file)

    def load_existing_data(self) -> List[Dict[str, Any]]:
        """Return the posts loaded from the sample-data file at construction.

        Returns:
            The stored posts, or an empty list when nothing was loaded.
        """
        if not self.sample_data:
            return []
        return self.sample_data.get('posts', [])

    def update_hot_topics(self, posts: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Compute hot-topic statistics for *posts*.

        Args:
            posts: Posts to analyse.

        Returns:
            A dict with update_time, total_posts, hot_keywords (top 20),
            hot_hashtags (top 10), category_distribution, trending_up and
            emerging_keywords.
        """
        keyword_counter = Counter()
        category_counter = Counter()
        hashtag_counter = Counter()

        for post in posts:
            keyword_counter.update(post.get('keywords', []))
            category_counter[post.get('category', 'unknown')] += 1
            hashtag_counter.update(utils.extract_hashtags(post.get('content', '')))

        return {
            'update_time': utils.get_current_time(),
            'total_posts': len(posts),
            'hot_keywords': dict(keyword_counter.most_common(20)),
            'hot_hashtags': dict(hashtag_counter.most_common(10)),
            'category_distribution': dict(category_counter),
            'trending_up': self._identify_trending_topics(keyword_counter, posts),
            'emerging_keywords': self._find_emerging_keywords(posts),
        }

    def _identify_trending_topics(self, keyword_counter: Counter, posts: List[Dict[str, Any]]) -> List[str]:
        """Pick keywords that count as trending.

        Simplified rule: of the ten most common keywords, keep those seen at
        least three times.

        Args:
            keyword_counter: Keyword occurrence counts.
            posts: Posts the counts came from (not used by this rule).

        Returns:
            Trending keywords, most frequent first.
        """
        return [kw for kw, count in keyword_counter.most_common(10) if count >= 3]

    def _find_emerging_keywords(self, posts: List[Dict[str, Any]]) -> List[str]:
        """Find keywords that appear in recent posts.

        Simplified rule: the distinct keywords of posts from the last six
        hours, capped at ten.

        Args:
            posts: Posts to scan.

        Returns:
            Up to ten distinct keywords in first-seen order.
        """
        # A dict keeps insertion order, so the result is deterministic;
        # slicing a set returned an arbitrary ten.
        emerging = {}
        for post in posts:
            if self._is_recent_post(post.get('timestamp')):
                for keyword in post.get('keywords', []):
                    emerging.setdefault(keyword, None)
        return list(emerging)[:10]

    def _is_recent_post(self, timestamp: str) -> bool:
        """Tell whether *timestamp* lies within the last six hours.

        Args:
            timestamp: ``YYYY-MM-DD HH:MM:SS`` string.

        Returns:
            True for a recent post; False when older, missing or unparsable.
        """
        try:
            post_time = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        except (TypeError, ValueError):
            # TypeError: timestamp is not a string (e.g. None).
            # ValueError: the string does not match the expected format.
            return False
        return post_time > datetime.now() - timedelta(hours=6)

4. 数据分析器 (analyzer.py)

# -*- coding: utf-8 -*-

"""

社交媒体数据分析器

负责分析采集到的数据，提取热点信息

"""

import re

from datetime import datetime, timedelta

from typing import List, Dict, Any, Tuple, Set

from collections import Counter, defaultdict

from .utils import DataUtils, utils

class HotTopicAnalyzer:

def __init__(self):

"""初始化分析器"""

self.stop_words = utils.STOP_WORDS if hasattr(utils, 'STOP_WORDS') else set()

self.sentiment_words = self._load_sentiment_words()

def _load_sentiment_words(self) -> Dict[str, List[str]]:

"""加载情感词典"""

try:

from config import SENTIMENT_WORDS

return SENTIMENT_WORDS

except ImportError:

return {

'positive': ['喜欢', '爱', '棒', '好', '赞', '支持', '开心', '快乐'],

'negative': ['讨厌', '恨', '差', '坏', '垃圾', '失望', '生气', '愤怒']

}

def analyze_posts(self, posts: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Analyse a list of posts.

    Args:
        posts: Post dicts; the text is read from each post's 'content'.

    Returns:
        A dict with analysis_time, total_posts, hot_keywords (top 20 words
        seen at least twice), hot_hashtags and hot_mentions (top 10 each),
        sentiment_analysis, time_trend, platform_distribution,
        interaction_stats, category_analysis and trending_keywords. For
        empty input the empty default result is returned.
    """
    if not posts:
        return self._empty_analysis_result()

    def top_entries(freq, limit):
        # Highest counts first; entries with equal counts keep their order.
        ranked = sorted(freq.items(), key=lambda entry: entry[1], reverse=True)
        return dict(ranked[:limit])

    contents = [post.get('content', '') for post in posts]

    # Word frequencies over the text of all posts joined together.
    word_freq = utils.calculate_frequency(
        utils.segment_chinese_text(" ".join(contents))
    )

    # Hashtag frequencies, collected post by post.
    hashtag_freq = utils.calculate_frequency(
        [tag for content in contents for tag in utils.extract_hashtags(content)]
    )

    # @-mention frequencies, collected post by post.
    mention_freq = utils.calculate_frequency(
        [name for content in contents for name in utils.extract_mentions(content)]
    )

    sentiment_analysis = self._analyze_sentiment(posts)
    time_trend = self._analyze_time_trend(posts)
    platform_dist = self._analyze_platform_distribution(posts)
    interaction_stats = self._analyze_interactions(posts)

    # Hot keywords: drop words seen fewer than min_freq times, keep the top 20.
    min_freq = 2
    frequent_words = {
        word: count for word, count in word_freq.items() if count >= min_freq
    }
    hot_keywords = top_entries(frequent_words, 20)

    return {
        'analysis_time': utils.get_current_time(),
        'total_posts': len(posts),
        'hot_keywords': hot_keywords,
        'hot_hashtags': top_entries(hashtag_freq, 10),
        'hot_mentions': top_entries(mention_freq, 10),
        'sentiment_analysis': sentiment_analysis,
        'time_trend': time_trend,
        'platform_distribution': platform_dist,
        'interaction_stats': interaction_stats,
        'category_analysis': self._analyze_categories(posts),
        'trending_keywords': self._identify_trending_keywords(posts, word_freq),
    }

def _empty_analysis_result(self) -> Dict[str, Any]:
    """Return the default analysis result used when there are no posts."""
    result: Dict[str, Any] = {
        'analysis_time': utils.get_current_time(),
        'total_posts': 0,
    }
    # Keys are added in the same order as in a populated result.
    for key in ('hot_keywords', 'hot_hashtags', 'hot_mentions'):
        result[key] = {}
    result['sentiment_analysis'] = {'positive': 0, 'negative': 0, 'neutral': 0}
    for key in ('time_trend', 'platform_distribution',
                'interaction_stats', 'category_analysis'):
        result[key] = {}
    result['trending_keywords'] = []
    return result

def _analyze_sentiment(self, posts: List[Dict[str, Any]]) -> Dict[str, int]:

"""

分析情感倾向

Args:

posts: 帖子数据列表

Returns:

情感分析结果

"""

sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0}

for post in posts:

content = post.get('content', '')

positive_score = sum(1 for word in self.sentiment_words['positive'

关注我，有更多实用程序等着你！

采集社交媒体热点话题，统计关键词出现频次，帮助普通人了解当前流行啥。

python+locust电商全流程性能测试

GitHub Template仓库快速初始化TensorFlow项目

GitHub Releases发布TensorFlow项目正式版本

GitHub上最受欢迎的TensorFlow-v2.9项目合集分享

HTML页面集成TensorFlow.js：实现浏览器端模型推理

Docker安装Ubuntu镜像并部署TensorFlow-v2.9环境