最近玩爬虫采集了沪深股票的财报数据。 使用了 Django-admin 来管理数据。

对于具体的财报在数据库中的存储我是这样设计表结构的:

class Report(models.Model):
    """报表"""
    id = UnsignedAutoField(primary_key=True)
    name = models.CharField('名称', max_length=128, null=True, blank=True)
    stock = models.ForeignKey('Stock', on_delete=models.CASCADE)
    report_type = models.ForeignKey(ReportType, on_delete=models.CASCADE)
    year = models.IntegerField('年度')
    quarter = models.IntegerField('季度')
    report_date = models.DateField('公布日期', null=True, blank=True)

    def __str__(self):
        return self.name

    class Meta:
        verbose_name = '单季度报表'
        verbose_name_plural = verbose_name
        unique_together = ["stock", "report_type", 'year', 'quarter']


class ReportItem(models.Model):
    """
    报表项目
    """
    NUMBER_TYPE = 1
    STRING_TYPE = 2

    VALUE_TYPES = [
        (NUMBER_TYPE, '数值'),
        (STRING_TYPE, '字符串'),
    ]

    YUAN = 1
    WYUAN = 2
    YI = 3
    GE = 4
    REN = 5
    CI = 6
    RATE = 7

    UNIT_TYPES = [
        (YUAN, '元'),
        (WYUAN, '万元'),
        (YI, '亿'),
        (GE, '个'),
        (REN, '人'),
        (CI, '次'),
        (RATE, '%'),
    ]

    id = UnsignedBigAutoField(primary_key=True)
    report = models.ForeignKey(Report, verbose_name='报表', on_delete=models.CASCADE, db_index=True)
    subject = models.ForeignKey(AccountingSubject, on_delete=models.CASCADE)
    number_value = models.DecimalField('数值', max_digits=30, decimal_places=4, null=True, blank=True)
    str_value = models.CharField('值', max_length=64, null=True, blank=True)
    value_type = models.SmallIntegerField('值类型', choices=VALUE_TYPES, null=True, blank=True)
    value_unit = models.SmallIntegerField('值单位', choices=UNIT_TYPES, null=True, blank=True)

    def __str__(self):
        return self.subject.name

    def value(self):
        if self.value_type == NUMBER_TYPE:
            return self.number_value, self.value_type

        return self.str_value, self.value_type

    class Meta:
        abstract = True
        verbose_name = '报表项目'
        verbose_name_plural = verbose_name

这不得了,我采集了2008~2019年的所有财报数据,发现 report_item 表有5千万条以上的数据。

于是对 report_item 进行了按年度分表

class DynamicModel(object):
    _models = dict()

    def __new__(cls, base_cls, db_table_suffix):
        """
        创建类
        :param base_cls: 模型基类, 要在基类 Meta 中定义 abstract = True
        :param db_table_suffix: 表后缀
        :return new_model_cls: 类
        """
        new_cls_name = f"{base_cls.__name__}_{db_table_suffix}"
        if new_cls_name not in cls._models:
            new_db_table = "{}_{}".format(base_cls._meta.db_table, db_table_suffix)
            model_cls = type(new_cls_name, (base_cls,), {'__module__': base_cls.__module__})
            model_cls._meta.db_table = new_db_table
            cls._models[new_cls_name] = model_cls

            # 不存在表则创建表
            cursor = connection.cursor()
            tables = [tableinfo.name for tableinfo in connection.introspection.get_table_list(cursor)]
            if new_db_table not in tables:
                with connection.schema_editor() as schema_editor:
                    schema_editor.create_model(model_cls)

        return cls._models[new_cls_name]

只需把模型 ReportItemMeta 中增加 abstract = True;

使用示例

class ReportPipeline(object):

    ...

    async def download_report_items(self, report, created, item):
        report_item_model = DynamicModel(ReportItem, item['report_year'])

        async def has_items():
            return report_item_model.objects.filter(report=report).exists()

        if created or not await has_items():
            items_to_insert = list()
            for slug, value in item['report_data'].items():
                subject = await self.get_subject(report.report_type, slug)

                if isinstance(value, int) or isinstance(value, float):
                    value_type = ReportItem.NUMBER_TYPE
                else:
                    value_type = ReportItem.STRING_TYPE

                items_to_insert.append(report_item_model(
                    report=report,
                    subject=subject,
                    number_value=value if value_type == ReportItem.NUMBER_TYPE else None,
                    str_value=value if value_type != ReportItem.NUMBER_TYPE else None,
                    value_type=value_type
                ))

            await self.bulk_create_items(report_item_model, items_to_insert)

标签: django

添加新评论