文章/答案/技术大牛

发布

社区首页 >问答首页 >Postgres复制命令-带逗号的字段，用双引号引号

问Postgres COPY 命令：含逗号的字段被两层双引号括起
EN

Stack Overflow用户

提问于 2012-11-01 18:53:26

回答 2查看 5.1K关注 0票数 1

我搜索并发现了一些与postgres导入有关的帖子，但没有解决我目前的问题。

我一直使用 Postgres 的 COPY 命令把来自异构数据源的数据导入我们的系统。目前正在处理一个一亿行的 .csv 文件，该文件以逗号分隔、以双引号作为字段限定符。问题出在这样的行上：

009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"

也就是被两层双引号括起来、内部又含有逗号的字段。这类字段无法被正确解析，我得到了如下错误：

"ERROR:  extra data after last expected column"

通常，当出现这种情况时，我会临时处理违规行，但是这个文件太大了，我希望有一些更通用的方法来抵御它。要求修改数据格式是不可能的。

copy mytable from '/path/to/file.csv' csv header quote '"'

postgresql

csv

copy

回答 2

Stack Overflow用户

发布于 2012-11-01 19:22:07

那是畸形的CSV。将双引号加倍以将双引号嵌入引号字段中；例如：

"where","is ""pancakes""","house?"

有三个价值：

where
is "pancakes"
house?

出问题的那一行用了两层双引号来括住字段（即字段两端各多了一个双引号）：

009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
                                               ^^                            ^^

我认为 COPY 命令对此无能为力，因为正确的写法并不唯一：它应该是 "this one, well, is a problem"，还是 """this one, well, is a problem"""？

我想你得用手把它修好。如果您可以唯一地识别已损坏的行，那么快速的sed一行应该能够完成这项工作。

作为参考，我所见过的最接近 CSV 标准的文档是 RFC 4180，其第 2 节中有如下规定：

5.  Each field may or may not be enclosed in double quotes (however
    some programs, such as Microsoft Excel, do not use double quotes
    at all).  If fields are not enclosed with double quotes, then
    double quotes may not appear inside the fields.  For example:

    "aaa","bbb","ccc" CRLF
     zzz,yyy,xxx
[...]
7.  If double-quotes are used to enclose fields, then a double-quote
    appearing inside a field must be escaped by preceding it with
    another double quote.  For example:

    "aaa","b""bb","ccc"

票数 4

Stack Overflow用户

发布于 2012-11-01 22:34:12

下面的代码基于 Kernighan 和 Pike 所著《程序设计实践》（The Practice of Programming）中的 CSV 代码，并已改写为可以处理这种格式怪异的 CSV 数据。（这并不难：主体代码及其打包工作我早已完成，所以只需添加 CSV 输出函数，并修改 advquoted() 函数来处理本问题中的怪异格式。）

csv2.h

/*
@(#)File:           $RCSfile: csv2.h,v $
@(#)Version:        $Revision: 2.1 $
@(#)Last changed:   $Date: 2012/11/01 22:23:07 $
@(#)Purpose:        Scanner for Comma Separated Variable (CSV) Data
@(#)Author:         J Leffler
@(#)Origin:         Kernighan & Pike, 'The Practice of Programming'
*/

/*TABSTOP=4*/

#ifndef CSV2_H
#define CSV2_H

#ifdef  __cplusplus
extern "C" {
#endif

#ifdef MAIN_PROGRAM
#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_h[] = "@(#)$Id: csv2.h,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */
#endif /* MAIN_PROGRAM */

#include <stdio.h>

/*
** Input side.
** csvgetline() reads one line from ifp (LF, CR or CRLF terminated) into an
** internal buffer and splits it into fields.  It returns a pointer to that
** buffer, or NULL at end of input or when memory cannot be allocated.  The
** buffer is reused by the next call.
** csvgetfield() returns field n (counting from 0) of the line most recently
** read, or a null pointer when n is not less than csvnfield().
** csvreset() frees the internal buffers; a later csvgetline() allocates
** them again.
*/
extern char  *csvgetline(FILE *ifp);    /* Read next input line */
extern char  *csvgetfield(size_t n);    /* Return field n */
extern size_t csvnfield(void);          /* Return number of fields */
extern void   csvreset(void);           /* Release space used by CSV */

/*
** Output side.
** csvputfield() writes one field to ofp, enclosing it in double quotes (and
** doubling embedded double quotes) when it contains a double quote, comma,
** CR or LF.  It returns EOF on failure, otherwise the value from fputs().
** csvputline() writes nfields fields separated by commas, followed by the
** end-of-line string; it returns EOF on failure.
** csvseteol() sets the end-of-line string (initially "\n"); strings longer
** than 7 bytes are truncated.
*/
extern int    csvputfield(FILE *ofp, const char *field);
extern int    csvputline(FILE *ofp, char **fields, int nfields);
extern void   csvseteol(const char *eol);

#ifdef  __cplusplus
}
#endif

#endif /* CSV2_H */

csv2.c

/*
@(#)File:           $RCSfile: csv2.c,v $
@(#)Version:        $Revision: 2.1 $
@(#)Last changed:   $Date: 2012/11/01 22:23:07 $
@(#)Purpose:        Scanner for Comma Separated Variable (CSV) Data
@(#)Modification:   Deal with specific malformed CSV
@(#)Author:         J Leffler
@(#)Origin:         Kernighan & Pike, 'The Practice of Programming'
*/

/*TABSTOP=4*/

#ifndef lint
/* Prevent over-aggressive optimizers from eliminating ID string */
const char jlss_id_csv2_c[] = "@(#)$Id: csv2.c,v 2.1 2012/11/01 22:23:07 jleffler Exp $";
#endif /* lint */

/*
** See RFC 4180 (http://www.ietf.org/rfc/rfc4180.txt).
**
** Specific malformed CSV - see SO 13183644 (http://stackoverflow.com/questions/13183644).
** Data contains malformed CSV fields like: OK,""this is a problem"",OK
** Two (but not three) field quotes at the start extract as "this is a problem" (with the quotes).
*/

#include "csv2.h"
#include <stdlib.h>
#include <string.h>

/* Returned by split() when the field-pointer array cannot be enlarged */
enum { NOMEM = -2 };

/* Scanner state: allocated by csvgetline(), released by csvreset() */
static char *line = 0;      /* Input line as read; returned by csvgetline() */
static char *sline = 0;     /* Copy of line that split() cuts into fields */
static size_t maxline = 0;  /* Size of line[] and sline[] */
static char **field = 0;    /* Field pointers (into sline) */
static size_t maxfield = 0; /* Size of field[] */
static size_t nfield = 0;   /* Number of fields in the current line */

static char fieldsep[]= ",";    /* Field separator characters */
static char fieldquote = '"';   /* Quote character */

/* End-of-line string written by csvputline(); changed with csvseteol() */
static char eolstr[8] = "\n";

/*
** Release every buffer owned by the scanner and return it to its
** pristine state; the next csvgetline() call allocates afresh.
*/
void csvreset(void)
{
    free(field);
    field = 0;
    free(sline);
    sline = 0;
    free(line);
    line = 0;

    nfield = 0;
    maxfield = 0;
    maxline = 0;
}

/*
** Report whether c ends a line.  LF and CR both count; after a CR the
** next character is read from ifp and, unless it is LF or EOF, pushed
** back, so that CRLF is consumed as a single line ending.
** Returns 1 for an end of line, 0 otherwise.
*/
static int endofline(FILE *ifp, int c)
{
    if (c == '\n')
        return(1);
    if (c != '\r')
        return(0);

    /* CR seen: swallow an immediately following LF */
    int next = getc(ifp);
    if (next != EOF && next != '\n')
        ungetc(next, ifp);
    return(1);
}

/*
** Process a quoted field in place.
**
** p points just past the opening quote (split() has already stepped over
** it).  The field text is unquoted where it stands (doubled quotes become
** single quotes) and NUL-terminated.  The return value points at the
** separator, or the terminating NUL, that follows the field.
**
** Modified to deal with specific malformed CSV: a field written as
** ""some stuff"" is extracted as "some stuff", with one pair of quotes
** retained.
*/
static char *advquoted(char *p)
{
    size_t i;
    size_t j;
    /*
    ** p[1] must be tested against '\0' before &p[2] is examined.  An empty
    ** quoted field at the end of a line (...,"") leaves p[0] as the closing
    ** quote and p[1] as the NUL; searching from &p[2] would then read past
    ** the end of the string.
    */
    if (p[0] == fieldquote &&
        p[1] != '\0' && p[1] != *fieldsep && p[1] != fieldquote)
    {
        /* Malformed CSV: ""some stuff"" --> "some stuff" */
        /* Find "\"\"," or "\"\"\0" to mark end of field */
        /* If we don't find it, drop through to 'regular' case */
        char *eof = strstr(&p[2], "\"\"");
        if (eof != 0 && (eof[2] == *fieldsep || eof[2] == '\0'))
        {
            /* Keep the first closing quote, cut the field after it */
            eof[1] = '\0';
            return(eof + 2);
        }
    }
    for (i = j = 0; p[j] != '\0'; i++, j++)
    {
        if (p[j] == fieldquote && p[++j] != fieldquote)
        {
            /* Closing quote: copy any trailing text up to the separator */
            size_t k = strcspn(p+j, fieldsep);
            memmove(p+i, p+j, k);
            i += k;
            j += k;
            break;
        }
        p[i] = p[j];
    }
    p[i] = '\0';
    return(p + j);
}

/*
** Split the current input line into fields.
**
** Copies line[] into sline[], cuts the copy into NUL-terminated fields and
** records a pointer to each one in field[], enlarging field[] as needed.
** Returns the number of fields (0 for an empty line), or NOMEM if field[]
** cannot be enlarged.
*/
static int split(void)
{
    char *p;
    char **newf;
    char *sepp;
    int sepc;

    nfield = 0;
    if (line[0] == '\0')
        return(0);
    strcpy(sline, line);
    p = sline;

    do
    {
        if (nfield >= maxfield)
        {
            /* Commit the new size only once the reallocation has succeeded */
            size_t newmax = maxfield * 2;
            newf = (char **)realloc(field, newmax * sizeof(field[0]));
            if (newf == 0)
                return NOMEM;
            field = newf;
            maxfield = newmax;
        }
        if (*p == fieldquote)
            sepp = advquoted(++p);
        else
            sepp = p + strcspn(p, fieldsep);
        sepc = sepp[0];
        sepp[0] = '\0';
        field[nfield++] = p;
        p = sepp + 1;
        /* Continue while the field ended at a separator, not at the end of line */
    } while (sepc != '\0' && strchr(fieldsep, sepc) != 0);

    return(nfield);
}

/*
** Read the next line from ifp and split it into fields.
**
** The line terminator (LF, CR or CRLF) is not stored.  The buffers are
** allocated on the first call and doubled in size whenever the line
** outgrows them.
**
** Returns a pointer to the internal copy of the line, valid until the
** next call, or NULL at end of input (EOF with nothing read) or when
** memory cannot be allocated; in the latter case all buffers are released.
*/
char *csvgetline(FILE *ifp)
{
    size_t i;
    int    c;

    if (line == NULL)
    {
        /* Allocate on first call */
        maxline = maxfield = 1;
        line = (char *)malloc(maxline);     /*=C++=*/
        sline = (char *)malloc(maxline);    /*=C++-*/
        field = (char **)malloc(maxfield*sizeof(field[0])); /*=C++=*/
        if (line == NULL || sline == NULL || field == NULL)
        {
            csvreset();
            return(NULL);   /* out of memory */
        }
    }
    for (i = 0; (c = getc(ifp)) != EOF && !endofline(ifp, c); i++)
    {
        if (i >= maxline - 1)
        {
            size_t newmax = maxline * 2;
            char  *newl;
            char  *news;
            /*
            ** Store each pointer as soon as its realloc() succeeds.  If
            ** both were reallocated before either was stored, a failure of
            ** one would leave the other pointing at memory that realloc()
            ** may already have released, and csvreset() would free it a
            ** second time while leaking the new block.
            */
            newl = (char *)realloc(line, newmax);   /*=C++=*/
            if (newl == NULL)
            {
                csvreset();
                return(NULL);   /* out of memory */
            }
            line = newl;
            news = (char *)realloc(sline, newmax);  /*=C++-*/
            if (news == NULL)
            {
                csvreset();
                return(NULL);   /* out of memory */
            }
            sline = news;
            maxline = newmax;
        }
        line[i] = (char)c;
    }
    line[i] = '\0';
    if (split() == NOMEM)
    {
        csvreset();
        return(NULL);
    }
    /* EOF with no characters read means there is no further line */
    return((c == EOF && i == 0) ? NULL : line);
}


/*
** Return field n (counting from 0) of the most recently split line,
** or a null pointer when n is out of range.
*/
char *csvgetfield(size_t n)
{
    return((n < nfield) ? field[n] : 0);
}

/* Number of fields found in the most recently split line */
size_t csvnfield(void)
{
    size_t count = nfield;
    return(count);
}

/*
** Write one CSV field to ofp.
**
** A field containing a double quote, comma, CR or LF is written enclosed
** in double quotes with every embedded double quote doubled; any other
** field is written verbatim.
**
** Returns EOF if the temporary buffer cannot be allocated, otherwise the
** value returned by fputs().
*/
int csvputfield(FILE *ofp, const char *ofield)
{
    /* Nothing that needs protecting: emit the text as it stands */
    if (strpbrk(ofield, "\",\r\n") == 0)
        return fputs(ofield, ofp);

    /* Two enclosing quotes, plus one byte per character and one more per quote */
    size_t need = 2;
    for (const char *scan = ofield; *scan != '\0'; scan++)
        need += (*scan == '"') ? 2 : 1;

    char *buffer = malloc(need + 1);
    if (buffer == 0)
        return EOF;

    char *dst = buffer;
    *dst++ = '"';
    for (const char *src = ofield; *src != '\0'; src++)
    {
        if (*src == '"')
            *dst++ = '"';       /* double an embedded quote */
        *dst++ = *src;
    }
    *dst++ = '"';
    *dst = '\0';

    int status = fputs(buffer, ofp);
    free(buffer);
    return status;
}

/*
** Write nfields fields to ofp as one CSV record: fields separated by
** commas, each written with csvputfield(), followed by the current
** end-of-line string.
**
** Returns EOF as soon as any write fails, otherwise the value returned
** by fputs() for the end-of-line string.
*/
int csvputline(FILE *ofp, char **fields, int nfields)
{
    for (int i = 0; i < nfields; i++)
    {
        /* A failed separator write is reported like any other failed write */
        if (i > 0 && putc(',', ofp) == EOF)
            return EOF;
        if (csvputfield(ofp, fields[i]) == EOF)
            return EOF;
    }
    return(fputs(eolstr, ofp));
}

/*
** Set the end-of-line string used by csvputline().  A string too long
** for the internal buffer is truncated to fit (terminator included).
*/
void csvseteol(const char *eol)
{
    const size_t limit = sizeof(eolstr) - 1;
    size_t length = strlen(eol);

    if (length > limit)
        length = limit;
    memmove(eolstr, eol, length);
    eolstr[length] = '\0';
}

#ifdef TEST

/*
** Test driver: read CSV from standard input, print each line and each
** field twice (raw, then as written by csvputfield()), and finally the
** whole record as rewritten by csvputline().
*/
int main(void)
{
    char *in_line;

    while ((in_line = csvgetline(stdin)) != 0)
    {
        size_t n = csvnfield();
        printf("line = '%s'\n", in_line);
        if (n == 0)
        {
            /*
            ** Blank input line: there are no fields.  Handled separately
            ** because a zero-length VLA is undefined behaviour and n-1
            ** would wrap around.
            */
            fputs("fields[] = ", stdout);
            csvputline(stdout, 0, 0);
        }
        else
        {
            char *fields[n];        /* C99 VLA */
            for (size_t i = 0; i < n; i++)
            {
                printf("field[%zu] = '%s'\n", i, csvgetfield(i));
                printf("field[%zu] = [", i);
                csvputfield(stdout, csvgetfield(i));
                fputs("]\n", stdout);
                fields[i] = csvgetfield(i);
            }
            printf("fields[0..%zu] = ", n-1);
            csvputline(stdout, fields, (int)n);
        }
    }

    csvreset();     /* Release the scanner's buffers before exiting */
    return(0);
}

#endif /* TEST */

用-DTEST编译代码，创建一个带有示例main()函数的程序。您需要一个C99编译器；main()中的代码使用VLA (可变长度数组)。您可以通过动态内存分配或悲观(过高)内存分配来避免这种情况(现在几千个指针的数组不会杀死大多数系统，但几乎没有CSV文件每行有几千个字段)。

示例数据

完全基于问题中的数据。

009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"
123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky
503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"
610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"

示例输出

line = '009098,0981098094,"something","something else",""this one, well, is a problem"", "another thing"'
field[0] = '009098'
field[0] = [009098]
field[1] = '0981098094'
field[1] = [0981098094]
field[2] = 'something'
field[2] = [something]
field[3] = 'something else'
field[3] = [something else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = ' "another thing"'
field[5] = [" ""another thing"""]
fields[0..5] = 009098,0981098094,something,something else,"""this one, well, is a problem"""," ""another thing"""
line = '123458,1234561007,"anything","nothing else",""this one, well, is a problem"","dohicky'
field[0] = '123458'
field[0] = [123458]
field[1] = '1234561007'
field[1] = [1234561007]
field[2] = 'anything'
field[2] = [anything]
field[3] = 'nothing else'
field[3] = [nothing else]
field[4] = '"this one, well, is a problem"'
field[4] = ["""this one, well, is a problem"""]
field[5] = 'dohicky'
field[5] = [dohicky]
fields[0..5] = 123458,1234561007,anything,nothing else,"""this one, well, is a problem""",dohicky
line = '503458,1234598094,"nothing","everything else","""this one, well, it isn't a problem""","abelone"'
field[0] = '503458'
field[0] = [503458]
field[1] = '1234598094'
field[1] = [1234598094]
field[2] = 'nothing'
field[2] = [nothing]
field[3] = 'everything else'
field[3] = [everything else]
field[4] = '"this one, well, it isn't a problem"'
field[4] = ["""this one, well, it isn't a problem"""]
field[5] = 'abelone'
field[5] = [abelone]
fields[0..5] = 503458,1234598094,nothing,everything else,"""this one, well, it isn't a problem""",abelone
line = '610078,1236100794,"everything","anything else","this ""isn't a problem"", he said.","Orcas Rule"'
field[0] = '610078'
field[0] = [610078]
field[1] = '1236100794'
field[1] = [1236100794]
field[2] = 'everything'
field[2] = [everything]
field[3] = 'anything else'
field[3] = [anything else]
field[4] = 'this "isn't a problem", he said.'
field[4] = ["this ""isn't a problem"", he said."]
field[5] = 'Orcas Rule'
field[5] = [Orcas Rule]
fields[0..5] = 610078,1236100794,everything,anything else,"this ""isn't a problem"", he said.",Orcas Rule

每个字段会打印两次：一次用于测试字段提取，一次用于测试字段输出。如果只是想把文件从格式错误的 CSV 转换为格式正确的 CSV，可以删去除 csvputline() 之外的所有打印语句来简化输出。

票数 2

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/13183644

复制

相似问题

问Postgres复制命令-带逗号的字段，用双引号引号
EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问Postgres复制命令-带逗号的字段，用双引号引号EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问Postgres复制命令-带逗号的字段，用双引号引号
EN