@@ -1641,6 +1641,251 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
1641
1641
return 0 ;
1642
1642
}
1643
1643
1644
+ // custom line terminator
1645
+ int tokenize_whitespace_customterm (parser_t * self , size_t line_limit )
1646
+ {
1647
+ int i , slen , start_lines ;
1648
+ long maxstreamsize ;
1649
+ char c ;
1650
+ char * stream ;
1651
+ char * buf = self -> data + self -> datapos ;
1652
+
1653
+ start_lines = self -> lines ;
1654
+
1655
+ if (make_stream_space (self , self -> datalen - self -> datapos ) < 0 ) {
1656
+ self -> error_msg = "out of memory" ;
1657
+ return -1 ;
1658
+ }
1659
+
1660
+ stream = self -> stream + self -> stream_len ;
1661
+ slen = self -> stream_len ;
1662
+ maxstreamsize = self -> stream_cap ;
1663
+
1664
+ TRACE (("%s\n" , buf ));
1665
+
1666
+ for (i = self -> datapos ; i < self -> datalen ; ++ i )
1667
+ {
1668
+ // next character in file
1669
+ c = * buf ++ ;
1670
+
1671
+ TRACE (("tokenize_whitespace_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n" ,
1672
+ i , c , self -> file_lines + 1 , self -> line_fields [self -> lines ],
1673
+ self -> state ));
1674
+
1675
+ switch (self -> state ) {
1676
+
1677
+ case SKIP_LINE :
1678
+ if (c == self -> lineterminator ) {
1679
+ END_LINE ();
1680
+ }
1681
+ break ;
1682
+
1683
+ case WHITESPACE_LINE :
1684
+ if (c == self -> lineterminator ) {
1685
+ self -> file_lines ++ ;
1686
+ self -> state = START_RECORD ;
1687
+ break ;
1688
+ }
1689
+ // fall through
1690
+
1691
+ case EAT_WHITESPACE :
1692
+ if (c == self -> lineterminator ) {
1693
+ END_LINE ();
1694
+ self -> state = START_RECORD ;
1695
+ break ;
1696
+ } else if (!IS_WHITESPACE (c )) {
1697
+ self -> state = START_FIELD ;
1698
+ // fall through to subsequent state
1699
+ } else {
1700
+ // if whitespace char, keep slurping
1701
+ break ;
1702
+ }
1703
+
1704
+ case START_RECORD :
1705
+ // start of record
1706
+ if (skip_this_line (self , self -> file_lines )) {
1707
+ self -> state = SKIP_LINE ;
1708
+ if (c == self -> lineterminator ) {
1709
+ END_LINE ();
1710
+ }
1711
+ break ;
1712
+ } else if (c == self -> lineterminator ) {
1713
+ if (self -> skip_empty_lines ) {
1714
+ self -> file_lines ++ ;
1715
+ } else {
1716
+ END_LINE ();
1717
+ }
1718
+ break ;
1719
+ } else if (IS_WHITESPACE (c )) {
1720
+ if (self -> skip_empty_lines )
1721
+ self -> state = WHITESPACE_LINE ;
1722
+ else
1723
+ self -> state = EAT_WHITESPACE ;
1724
+ break ;
1725
+ } else if (c == self -> commentchar ) {
1726
+ self -> state = EAT_LINE_COMMENT ;
1727
+ break ;
1728
+ } else {
1729
+ // nominal character - handle as START_FIELD
1730
+ self -> state = START_FIELD ;
1731
+ }
1732
+ // fall through
1733
+
1734
+ case START_FIELD :
1735
+ // expecting field
1736
+ if (c == self -> lineterminator ) {
1737
+ END_FIELD ();
1738
+ END_LINE ();
1739
+ } else if (c == self -> quotechar &&
1740
+ self -> quoting != QUOTE_NONE ) {
1741
+ // start quote field
1742
+ self -> state = IN_QUOTED_FIELD ;
1743
+ } else if (c == self -> escapechar ) {
1744
+ // possible escaped character
1745
+ self -> state = ESCAPED_CHAR ;
1746
+ } else if (IS_WHITESPACE (c )) {
1747
+ self -> state = EAT_WHITESPACE ;
1748
+ } else if (c == self -> commentchar ) {
1749
+ END_FIELD ();
1750
+ self -> state = EAT_COMMENT ;
1751
+ } else {
1752
+ // begin new unquoted field
1753
+ if (self -> quoting == QUOTE_NONNUMERIC )
1754
+ self -> numeric_field = 1 ;
1755
+
1756
+ PUSH_CHAR (c );
1757
+ self -> state = IN_FIELD ;
1758
+ }
1759
+ break ;
1760
+
1761
+ case EAT_LINE_COMMENT :
1762
+ if (c == self -> lineterminator ) {
1763
+ self -> file_lines ++ ;
1764
+ self -> state = START_RECORD ;
1765
+ }
1766
+ break ;
1767
+
1768
+ case ESCAPED_CHAR :
1769
+ PUSH_CHAR (c );
1770
+ self -> state = IN_FIELD ;
1771
+ break ;
1772
+
1773
+ case IN_FIELD :
1774
+ // in unquoted field
1775
+ if (c == self -> lineterminator ) {
1776
+ END_FIELD ();
1777
+ END_LINE ();
1778
+ } else if (c == self -> escapechar ) {
1779
+ // possible escaped character
1780
+ self -> state = ESCAPED_CHAR ;
1781
+ } else if (IS_WHITESPACE (c )) {
1782
+ // end of field (end of line not reached yet)
1783
+ END_FIELD ();
1784
+ self -> state = EAT_WHITESPACE ;
1785
+ } else if (c == self -> commentchar ) {
1786
+ END_FIELD ();
1787
+ self -> state = EAT_COMMENT ;
1788
+ } else {
1789
+ // normal character - save in field
1790
+ PUSH_CHAR (c );
1791
+ }
1792
+ break ;
1793
+
1794
+ case IN_QUOTED_FIELD :
1795
+ // in quoted field
1796
+ if (c == self -> escapechar ) {
1797
+ // possible escape character
1798
+ self -> state = ESCAPE_IN_QUOTED_FIELD ;
1799
+ } else if (c == self -> quotechar &&
1800
+ self -> quoting != QUOTE_NONE ) {
1801
+ if (self -> doublequote ) {
1802
+ // double quote - " represented by ""
1803
+ self -> state = QUOTE_IN_QUOTED_FIELD ;
1804
+ }
1805
+ else {
1806
+ // end of quote part of field
1807
+ self -> state = IN_FIELD ;
1808
+ }
1809
+ } else {
1810
+ // normal character - save in field
1811
+ PUSH_CHAR (c );
1812
+ }
1813
+ break ;
1814
+
1815
+ case ESCAPE_IN_QUOTED_FIELD :
1816
+ PUSH_CHAR (c );
1817
+ self -> state = IN_QUOTED_FIELD ;
1818
+ break ;
1819
+
1820
+ case QUOTE_IN_QUOTED_FIELD :
1821
+ // double quote - seen a quote in an quoted field
1822
+ if (self -> quoting != QUOTE_NONE && c == self -> quotechar ) {
1823
+ // save "" as "
1824
+ PUSH_CHAR (c );
1825
+ self -> state = IN_QUOTED_FIELD ;
1826
+ } else if (IS_WHITESPACE (c )) {
1827
+ // end of field (end of line not reached yet)
1828
+ END_FIELD ();
1829
+ self -> state = EAT_WHITESPACE ;
1830
+ } else if (c == self -> lineterminator ) {
1831
+ END_FIELD ();
1832
+ END_LINE ();
1833
+ } else if (!self -> strict ) {
1834
+ PUSH_CHAR (c );
1835
+ self -> state = IN_FIELD ;
1836
+ } else {
1837
+ self -> error_msg = (char * ) malloc (50 );
1838
+ sprintf (self -> error_msg , "'%c' expected after '%c'" ,
1839
+ self -> delimiter , self -> quotechar );
1840
+ goto parsingerror ;
1841
+ }
1842
+ break ;
1843
+
1844
+ case EAT_CRNL :
1845
+ if (c == self -> lineterminator ) {
1846
+ END_LINE ();
1847
+ } else if (IS_WHITESPACE (c )){
1848
+ // Handle \r-delimited files
1849
+ END_LINE_STATE (EAT_WHITESPACE );
1850
+ } else {
1851
+ /* XXX
1852
+ * first character of a new record--need to back up and reread
1853
+ * to handle properly...
1854
+ */
1855
+ i -- ; buf -- ; // back up one character (HACK!)
1856
+ END_LINE_STATE (START_RECORD );
1857
+ }
1858
+ break ;
1859
+
1860
+ case EAT_COMMENT :
1861
+ if (c == self -> lineterminator ) {
1862
+ END_LINE ();
1863
+ }
1864
+ break ;
1865
+
1866
+ default :
1867
+ break ;
1868
+ }
1869
+ }
1870
+
1871
+ _TOKEN_CLEANUP ();
1872
+
1873
+ TRACE (("Finished tokenizing input\n" ))
1874
+
1875
+ return 0 ;
1876
+
1877
+ parsingerror :
1878
+ i ++ ;
1879
+ _TOKEN_CLEANUP ();
1880
+
1881
+ return -1 ;
1882
+
1883
+ linelimit :
1884
+ i ++ ;
1885
+ _TOKEN_CLEANUP ();
1886
+
1887
+ return 0 ;
1888
+ }
1644
1889
1645
1890
static int parser_handle_eof (parser_t * self ) {
1646
1891
TRACE (("handling eof, datalen: %d, pstate: %d\n" , self -> datalen , self -> state ))
@@ -1851,11 +2096,17 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
1851
2096
int start_lines = self -> lines ;
1852
2097
1853
2098
if (self -> delim_whitespace ) {
1854
- tokenize_bytes = tokenize_whitespace ;
1855
- } else if (self -> lineterminator == '\0' ) {
1856
- tokenize_bytes = tokenize_delimited ;
2099
+ if (self -> lineterminator == '\0' ) {
2100
+ tokenize_bytes = tokenize_whitespace ;
2101
+ } else {
2102
+ tokenize_bytes = tokenize_whitespace_customterm ;
2103
+ }
1857
2104
} else {
1858
- tokenize_bytes = tokenize_delim_customterm ;
2105
+ if (self -> lineterminator == '\0' ) {
2106
+ tokenize_bytes = tokenize_delimited ;
2107
+ } else {
2108
+ tokenize_bytes = tokenize_delim_customterm ;
2109
+ }
1859
2110
}
1860
2111
1861
2112
if (self -> state == FINISHED ) {
0 commit comments